In [138]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import streamlit as sl
import matplotlib.pyplot as plt

#### 1. Read in the data, call the dataframe "s"  and check the dimensions of the dataframe


In [139]:
s=pd.read_csv('social_media_usage.csv')

#### 2. Define a function called clean_sm that takes one input, x, and uses `np.where` to check whether x is equal to 1. If it is, make the value of x = 1, otherwise make it 0. Return x. Create a toy dataframe with three rows and two columns and test your function to make sure it works as expected


In [140]:
def clean_sm(x):
    return np.where(x == 1, 1, 0)
data = {
    'col1': [1, 5, 1,0,6,7,8,9,1,2,3,5],
    'col2': [0, 1, 0,4,5,0,6,7,8,9,5,1]
}
df = pd.DataFrame(data)

df_cleaned = df.applymap(clean_sm)

print("Original DataFrame:")
print(df)
print("\nCleaned DataFrame:")
print(df_cleaned)

Original DataFrame:
    col1  col2
0      1     0
1      5     1
2      1     0
3      0     4
4      6     5
5      7     0
6      8     6
7      9     7
8      1     8
9      2     9
10     3     5
11     5     1

Cleaned DataFrame:
    col1  col2
0      1     0
1      0     1
2      1     0
3      0     0
4      0     0
5      0     0
6      0     0
7      0     0
8      1     0
9      0     0
10     0     0
11     0     1


  df_cleaned = df.applymap(clean_sm)


#### 3. Create a new dataframe called "ss". The new dataframe should contain a target column called sm_li which should be a binary variable ( that takes the value of 1 if it is 1 and 0 otherwise (use clean_sm to create this) which indicates whether or not the individual uses LinkedIn, and the following features: income (ordered numeric from 1 to 9, above 9 considered missing), education (ordered numeric from 1 to 8, above 8 considered missing), parent (binary), married (binary), female (binary), and age (numeric, above 98 considered missing). Drop any missing values. Perform exploratory analysis to examine how the features are related to the target.

In [141]:
ss=pd.DataFrame()
ss['sm_li']=s['web1h'].apply(clean_sm)
ss['income']=s['income'].apply(lambda x: x if x <98 else float('nan'))
ss['is_parent']=s['par'].apply(clean_sm)
ss['is_married']=s['marital'].apply(clean_sm)
ss['educ2']=s['educ2'].apply(lambda x: x if x <98 else float('nan'))
ss['is_female']=s['gender'].apply(lambda x: 1 if x== 2 else 0)
ss['age_years']=s['age'].apply(lambda x: x if x != 98 else float('nan'))
ss=ss.dropna()
ss

Unnamed: 0,sm_li,income,is_parent,is_married,educ2,is_female,age_years
0,0,6,0,0,4.0,1,77.0
1,0,5,0,0,3.0,0,59.0
2,0,8,0,1,4.0,1,60.0
3,0,8,0,0,8.0,0,73.0
4,1,7,0,1,8.0,1,65.0
...,...,...,...,...,...,...,...
1497,0,6,1,1,4.0,0,41.0
1498,1,2,0,0,5.0,0,46.0
1499,0,7,0,0,3.0,0,18.0
1500,0,4,0,0,3.0,1,23.0


#### 4. Create a target vector (y) and feature set (X)


In [142]:
y = ss['sm_li'] 
X = ss.drop(columns=['sm_li']) 

#### 5. Split the data into training and test sets. Hold out 20% of the data for testing. Explain what each new object contains and how it is used in machine learning

In [143]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### 6. Initialize a logistic regression model and set class_weight to balanced. Fit the model with the training data.

In [144]:
#Initialize Model
model = LogisticRegression(class_weight='balanced')

In [145]:
#Train model
model.fit(X_train, y_train)

#### 7. Evaluate the model using the testing data. What is the model accuracy for the model? Use the model to make predictions and then generate a confusion matrix from the model. Interpret the confusion matrix and explain what each number means.

In [146]:
#Predict on Test Set
y_pred = model.predict(X_test)

In [147]:
cm=confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix: \n", cm)


Confusion Matrix: 
 [[127  72]
 [ 33  64]]


#### 8. Create the confusion matrix as a dataframe and add informative column names and index names that indicate what each quadrant represents


In [148]:
cm_df=pd.DataFrame(cm,index=["Actual Negative (0)", "Actual Positive (1)"], 
                     columns=["Predicted Negative (0)", "Predicted Positive (1)"])
cm_df

Unnamed: 0,Predicted Negative (0),Predicted Positive (1)
Actual Negative (0),127,72
Actual Positive (1),33,64


#### 9. Aside from accuracy, there are three other metrics used to evaluate model performance: precision, recall, and F1 score. Use the results in the confusion matrix to calculate each of these metrics by hand. Discuss each metric and give an actual example of when it might be the preferred metric of evaluation. After calculating the metrics by hand, create a classification_report using sklearn and check to ensure your metrics match those of the classification_report.


In [149]:
# Confusion matrix values
TP = 62  # True Positives
FP = 68  # False Positives
FN = 24  # False Negatives
TN = 142  # True Negatives
# Calculating precision, recall, and F1 score by hand
precision = TP / (TP + FP) 
recall = TP / (TP + FN) 
f1_score = 2 * (precision * recall) / (precision + recall) 
print("\nPrecision (by hand):", precision)
print("\nRecall (by hand):", recall)
print("\nF1 Score (by hand):", f1_score)

#Confirm with classification report
print("\n\nClassification Report (sklearn) :\n", classification_report(y_test, y_pred, target_names=["Is not a LinkedIn user (0)", "Is a LinkedIn user (1)"]))



Precision (by hand): 0.47692307692307695

Recall (by hand): 0.7209302325581395

F1 Score (by hand): 0.5740740740740741


Classification Report (sklearn) :
                             precision    recall  f1-score   support

Is not a LinkedIn user (0)       0.79      0.64      0.71       199
    Is a LinkedIn user (1)       0.47      0.66      0.55        97

                  accuracy                           0.65       296
                 macro avg       0.63      0.65      0.63       296
              weighted avg       0.69      0.65      0.66       296



#### 10. Use the model to make predictions. For instance, what is the probability that a high income (e.g. income=8), with a high level of education (e.g. 7), non-parent who is married female and 42 years old uses LinkedIn? How does the probability change if another person is 82 years old, but otherwise the same?


#### Case 1: For the 42 year old woman, she has a ~68% probability of being a LinkedIn user, and a ~32% probability of not being a LinkedinUser

#### Case 2: For the 82 year old woman, she has a ~46% probability of being a LinkedIn user, and a ~53% probability of not being a LinkedinUser

In [150]:
case_1=pd.DataFrame(model.predict_proba(pd.DataFrame([{
  'income': 8,
  'is_parent': 0,
  'is_married': 1,
  'educ2': 7.0,
  'is_female': 1,
  'age_years': 42.0
}])), columns=['Negative Class Probability','Positive Class Probability'],index=['Case 1'])
case_2=pd.DataFrame(model.predict_proba(pd.DataFrame([{
  'income': 8,
  'is_parent': 0,
  'is_married': 1,
  'educ2': 7.0,
  'is_female': 1,
  'age_years': 82.0
}])), columns=['Negative Class Probability','Positive Class Probability'],index=['Case 2'])
pd.concat([case_1,case_2])

Unnamed: 0,Negative Class Probability,Positive Class Probability
Case 1,0.308185,0.691815
Case 2,0.549627,0.450373
