In [2]:
#Import libraries/packages
import pandas as pd
import numpy as np
import os 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score


In [3]:
#Loading the dataframe
filename = os.path.join(os.getcwd(), "dataframes", "actMath_11_encoded_df.csv")
actMath11_df = pd.read_csv(filename, header = 0)

In [8]:
actMath11_df.head()

Unnamed: 0,studentId,is_proficient,proficient_score,proficient_diff,school_High School A,school_High School B,school_High School C,school_High School D,iready_math,iready_reading,...,category_PERFORMING_ARTS,category_VISUAL_ARTS,category_TECHNOLOGY,category_BUSINESS,category_TRADES,category_AGRICULTURE,category_PHYSICAL_EDUCATION,category_HEALTH,category_CAREER_PREP,category_MILITARY_SCIENCE
0,45054,False,0.529412,-8.0,True,False,False,False,0,0,...,False,False,False,False,True,False,False,False,False,False
1,45054,False,0.529412,-8.0,True,False,False,False,0,0,...,False,False,False,False,True,False,False,False,False,False
2,45440,False,0.941176,-1.0,False,False,False,False,0,0,...,False,False,False,False,False,False,False,False,False,False
3,45440,False,0.941176,-1.0,False,False,False,False,0,0,...,False,False,False,False,False,False,False,False,False,False
4,45440,False,0.941176,-1.0,False,False,False,False,0,0,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# Get a list of columns with only True and False values
true_false_columns = actMath11_df.columns[actMath11_df.apply(lambda col: col.isin([True, False]).all())]

# Convert to a list
true_false_columns_list = true_false_columns.tolist()

#Make true false 1 and 0
actMath11_df[true_false_columns] = actMath11_df[true_false_columns].astype(int)


In [14]:
feature_list = list(actMath11_df.select_dtypes(include = ['int']).columns)
feature_list.remove('is_proficient')

In [15]:
feature_list

['studentId',
 'school_High School A',
 'school_High School B',
 'school_High School C',
 'school_High School D',
 'iready_math',
 'iready_reading',
 'bbb_literacy',
 'category_MATHEMATICS',
 'category_ENGLISH',
 'category_SCIENCE',
 'category_SOCIAL_STUDIES',
 'category_FOREIGN_LANGUAGE',
 'category_PERFORMING_ARTS',
 'category_VISUAL_ARTS',
 'category_TECHNOLOGY',
 'category_BUSINESS',
 'category_TRADES',
 'category_AGRICULTURE',
 'category_PHYSICAL_EDUCATION',
 'category_HEALTH',
 'category_CAREER_PREP',
 'category_MILITARY_SCIENCE']

In [16]:
# YOUR CODE HERE
y = actMath11_df['is_proficient']
X = actMath11_df[feature_list]

print("Number of examples: " + str(X.shape[0]))
print("\nNumber of Features:" + str(X.shape[1]))
print(str(list(X.columns)))

Number of examples: 3837

Number of Features:23
['studentId', 'school_High School A', 'school_High School B', 'school_High School C', 'school_High School D', 'iready_math', 'iready_reading', 'bbb_literacy', 'category_MATHEMATICS', 'category_ENGLISH', 'category_SCIENCE', 'category_SOCIAL_STUDIES', 'category_FOREIGN_LANGUAGE', 'category_PERFORMING_ARTS', 'category_VISUAL_ARTS', 'category_TECHNOLOGY', 'category_BUSINESS', 'category_TRADES', 'category_AGRICULTURE', 'category_PHYSICAL_EDUCATION', 'category_HEALTH', 'category_CAREER_PREP', 'category_MILITARY_SCIENCE']


In [17]:
#Create training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=1234)

In [18]:
print(X_train.shape)
print(X_test.shape)

(2570, 23)
(1267, 23)


In [19]:
# 1. Create the LogisticRegression model object below and assign to variable 'model'
model = LogisticRegression()

# 2. Fit the model to the training data below
model.fit(X_train, y_train)

# 3. Make predictions on the test data using the predict_proba() method and assign the 
# result to the variable 'probability_predictions' below
probability_predictions = model.predict_proba(X_test)

# print the first 5 probability class predictions
df_print = pd.DataFrame(probability_predictions, columns = ['Class: False', 'Class: True'])
print('Class Prediction Probabilities: \n' + df_print[0:5].to_string(index=False))

# 4. Compute the log loss on 'probability_predictions' and save the result to the variable
# 'l_loss' below
l_loss = log_loss(y_test, probability_predictions)
print('Log loss: ' + str(l_loss))

# 5. Make predictions on the test data using the predict() method and assign the result 
# to the variable 'class_label_predictions' below
class_label_predictions = model.predict(X_test)

# print the first 5 class label predictions 
print('Class labels: ' + str(class_label_predictions[0:5]))

# 6.Compute the accuracy score on 'class_label_predictions' and save the result 
# to the variable 'acc_score' below

acc_score = accuracy_score(y_test, class_label_predictions)
print('Accuracy: ' + str(acc_score))




Class Prediction Probabilities: 
 Class: False  Class: True
     0.706375     0.293625
     0.232794     0.767206
     0.144206     0.855794
     0.706384     0.293616
     0.756646     0.243354
Log loss: 0.5703094910147
Class labels: [0 1 1 0 0]
Accuracy: 0.7198105761641673


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
def computeAccuracy(threshold_value):
    
    labels=[]
    for p in probability_predictions[:,0]:
        if p >= threshold_value:
            labels.append(False)
        else:
            labels.append(True)
    
    acc_score = accuracy_score(y_test, labels)
    return acc_score

In [21]:
thresholds = [0.44, 0.50, 0.55, 0.67, 0.75]
for t in thresholds:
    print("Threshold value {:.2f}: Accuracy {}".format(t, str(computeAccuracy(t))))

Threshold value 0.44: Accuracy 0.7198105761641673
Threshold value 0.50: Accuracy 0.7198105761641673
Threshold value 0.55: Accuracy 0.7182320441988951
Threshold value 0.67: Accuracy 0.6921862667719021
Threshold value 0.75: Accuracy 0.5295974743488555
