In [None]:
 %matplotlib inline
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Load and Transform Data

In [None]:
# Read the CSV file into a Pandas DataFrame.
# Cells holding the literal 'redacted' are parsed as missing (NaN) instead of being
# overwritten with 0: a withheld value is not a true zero, and zero-filling would
# feed made-up numbers into the model. Parsing them as NaN also lets otherwise
# numeric columns load with a numeric dtype, and the later mean-imputation cell
# fills the gaps. The ">= 50" flags built below are unaffected, because NaN >= 50
# evaluates to False exactly as 0 >= 50 did.

districts = pd.read_csv('District_data.csv', na_values=['redacted'])
districts.head()

In [None]:
# Review data types: shows the dtype pandas inferred for every column, which makes
# it easy to spot numeric columns that were loaded as generic objects.
districts.dtypes

In [None]:
# Build 0/1 target columns: 1 when the district's *_metAbove value is 50 or higher,
# 0 otherwise. The comparison yields booleans, which are then cast to integers.
districts['Math_metAbove50'] = districts['Math_metAbove'].ge(50).astype(int)
districts['ELA_metAbove50'] = districts['ELA_metAbove'].ge(50).astype(int)
# Preview the first 30 rows to sanity-check the new flags.
districts.head(30)

In [None]:
# Give the average-years-of-teaching column a short name without spaces or
# parentheses so it is easier to reference in later cells.
districts = districts.rename(
    {"Avg Years Teaching (District)": "Avg_years_teaching"},
    axis="columns",
)
districts

In [None]:
# Replace NaNs with column means.
# numeric_only=True restricts the mean to numeric columns. Without it, pandas >= 2.0
# raises a TypeError if the frame contains any text column, while older versions
# only skipped such columns with a deprecation warning. Non-numeric columns are
# therefore left unfilled, as before.
districts = districts.fillna(districts.mean(numeric_only=True))

# Predicting ELA Proficiency

In [None]:
# Select the six predictor columns as the feature matrix X and the binary ELA
# proficiency flag as the target y.

X = districts[
    [
        "Enrollment",
        "FRL_Perc",
        "Teach_to_stud",
        "Per_pupil_exp",
        "Teacher_salary",
        "Avg_years_teaching",
    ]
]
y = districts["ELA_metAbove50"]

# Confirm X and y have the same number of rows.
print("Shape: ", X.shape, y.shape)


In [None]:
# Hold out part of the data for evaluation with scikit-learn's `train_test_split()`.
# The fixed random_state makes the split repeatable across runs; the split
# proportions are left at the library defaults.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [None]:
# Create the model: a logistic-regression classifier with scikit-learn's default
# hyperparameters (no arguments are overridden).
# NOTE(review): the features are passed in unscaled and are presumably on very
# different scales (enrollment counts vs. percentages); confirm the solver converges.

classifier = LogisticRegression()
# Bare expression so the notebook echoes the estimator's repr.
classifier

In [None]:
# Fit the model to the training data only; the test split stays unseen until scoring.

classifier.fit(X_train, y_train) 

In [None]:
# Report the classifier's score on the training split and on the held-out test
# split, one per line, so the two can be compared for signs of overfitting.

print(
    f"Training Data Score: {classifier.score(X_train, y_train)}",
    f"Testing Data Score: {classifier.score(X_test, y_test)}",
    sep="\n",
)

In [None]:
# Create predictions: predicted class labels for every row of the test split.
# y_pred is reused by the accuracy, confusion-matrix and report cells below.
y_pred = classifier.predict(X_test)
print(y_pred)

In [None]:
# Print accuracy score: the fraction of test rows whose predicted label matches
# the true label.
print(accuracy_score(y_test, y_pred))

In [None]:
# Create confusion matrix: rows are the true classes and columns are the predicted
# classes, in sorted label order (0, then 1).
# confusion_matrix / classification_report are imported with the other dependencies
# in the notebook's first cell rather than mid-notebook.
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

In [None]:
# Print the classification report: per-class precision, recall, F1-score and
# support for the test-set predictions, plus overall averages.
report = classification_report(y_test, y_pred)
print(report)

# Predicting Math Proficiency

In [32]:
# Select the six predictor columns as the feature matrix X and the binary Math
# proficiency flag as the target y.

X = districts[
    [
        "Enrollment",
        "FRL_Perc",
        "Teach_to_stud",
        "Per_pupil_exp",
        "Teacher_salary",
        "Avg_years_teaching",
    ]
]
y = districts["Math_metAbove50"]

# Confirm X and y have the same number of rows.
print("Shape: ", X.shape, y.shape)

Shape:  (1036, 6) (1036,)


In [33]:
# Hold out part of the data for evaluation with scikit-learn's `train_test_split()`.
# The fixed random_state makes the split repeatable across runs; the split
# proportions are left at the library defaults.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [34]:
# Create the model: a fresh logistic-regression classifier with scikit-learn's
# default hyperparameters. This rebinds `classifier`, replacing the earlier model.
# NOTE(review): the features are passed in unscaled and are presumably on very
# different scales (enrollment counts vs. percentages); confirm the solver converges.

classifier = LogisticRegression()
# Bare expression so the notebook echoes the estimator's repr.
classifier

LogisticRegression()

In [35]:
# Fit the model to the training data only; the test split stays unseen until scoring.

classifier.fit(X_train, y_train) 

LogisticRegression()

In [36]:
# Report the classifier's score on the training split and on the held-out test
# split, one per line, so the two can be compared for signs of overfitting.

print(
    f"Training Data Score: {classifier.score(X_train, y_train)}",
    f"Testing Data Score: {classifier.score(X_test, y_test)}",
    sep="\n",
)

Training Data Score: 0.879021879021879
Testing Data Score: 0.8996138996138996


In [37]:
# Create predictions: predicted class labels (0 or 1) for every row of the test
# split. y_pred is reused by the accuracy, confusion-matrix and report cells below.
y_pred = classifier.predict(X_test)
print(y_pred)

[1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 1 0 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0]


In [38]:
# Print accuracy score: the fraction of test rows whose predicted label matches
# the true label (the same figure as the testing score printed above).
print(accuracy_score(y_test, y_pred))

0.8996138996138996


In [39]:
# Create confusion matrix: rows are the true classes and columns are the predicted
# classes, in sorted label order (0, then 1).
# The duplicated mid-notebook import is dropped; confusion_matrix and
# classification_report are already in scope from an earlier import.
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[194  15]
 [ 11  39]]


In [40]:
# Print the classification report: per-class precision, recall, F1-score and
# support for the test-set predictions, plus accuracy and macro/weighted averages.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       209
           1       0.72      0.78      0.75        50

    accuracy                           0.90       259
   macro avg       0.83      0.85      0.84       259
weighted avg       0.90      0.90      0.90       259

