In [75]:
 %matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import sqlalchemy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Upload and Transform Data

In [76]:
# Create the engine to connect to the PostgreSQL database
engine = sqlalchemy.create_engine('postgresql://postgres:districtdata@districtdata.ckva5djfpzaj.us-east-1.rds.amazonaws.com:5432/District Data')
sql_query='''
SELECT * FROM district_demographics
JOIN district_scores
ON district_demographics.district=district_scores.district
'''
my_dataframe=pd.read_sql(sql_query,con=engine)
my_dataframe.head()

Unnamed: 0,district,county,district_type,enrollment,frl,disadv,el,grad,teach_to_stud,susp,chronic_absent,per_pupil_exp,teacher_salary,avg_yrs_teaching,district.1,math_metabove,ela_metabove
0,Happy Camp Union Elementary (Siskiyou),Siskiyou,Elementary School District,110,77.3,77.27,,,,7.9,29.5,13585,76081,,Happy Camp Union Elementary (Siskiyou),9.23,17.91
1,Shoreline Unified (Marin),Marin,Unified School District,508,66.9,68.9,42.1,94.3,,3.7,17.7,29742,87808,,Shoreline Unified (Marin),27.41,43.63
2,Cienega Union Elementary (San Benito),San Benito,Elementary School District,25,32.0,44.0,28.0,,0.0,0.0,6.3,11515,76081,,Cienega Union Elementary (San Benito),35.0,42.11
3,Alpine County Office of Education (Alpine),Alpine,County Office of Education (COE),6023,0.0,0.0,,,0.0,,,14708,76081,,Alpine County Office of Education (Alpine),37.2,48.3
4,Arena Union Elementary/Point Arena Joint Union...,Mendocino,Common Administration District,6023,56.8,60.0,18.4,,20.2,3.5,13.1,22151,57730,,Arena Union Elementary/Point Arena Joint Union...,37.3,48.3


In [None]:
# # Read the CSV file into a Pandas DataFrame
# districts = pd.read_csv('District_data.csv')
# districts = districts.replace('redacted',0)
# districts.head()

In [None]:
# Review data types.
districts.dtypes

In [None]:
# Create a binary variable showing whether a district has above 50% proficiency or below.
districts['Math_metAbove50'] = np.where(districts['Math_metAbove'] >= 50, True, False)
districts['ELA_metAbove50'] = np.where(districts['ELA_metAbove'] >= 50, True, False)
districts['Math_metAbove50'] = districts['Math_metAbove50'].astype(int)
districts['ELA_metAbove50'] = districts['ELA_metAbove50'].astype(int)
districts.head(30)

In [None]:
# Rename average years of teaching column.
districts = districts.rename(columns={"Avg Years Teaching (District)": "Avg_years_teaching"})
districts

In [None]:
# Replace NaaNs with means.
districts = districts.fillna(districts.mean())

# Could try median, maximum, minimum, mode, etc.

In [None]:
# Look at distribution of values (histogram). Do this higher up.

# Predicting ELA Proficiency

In [None]:
# Assign the data to X and y

X = districts[["Enrollment","FRL_Perc","Teach_to_stud","Per_pupil_exp","Teacher_salary","Avg_years_teaching"]]
y = districts["ELA_metAbove50"]

print("Shape: ", X.shape, y.shape)


In [77]:
# Use the Sklearn `train_test_split()` function to split the data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=42)


In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create the model

classifier = LogisticRegression()
classifier

In [None]:
# Fit the model to the training data. 

classifier.fit(X_train, y_train) 

In [None]:
# Print training and testing scores.

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [None]:
# Create predictions.
y_pred = classifier.predict(X_test)
print(y_pred)

In [None]:
# Print accuracy score.
print(accuracy_score(y_test, y_pred))

In [None]:
# Create confusion matrix.
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

In [None]:
# Print the classification report.
report = classification_report(y_test, y_pred)
print(report)

# Predicting Math Scores

In [None]:
# Assign the data to X and y

X = districts[["Enrollment","FRL_Perc","Teach_to_stud","Per_pupil_exp","Teacher_salary","Avg_years_teaching"]]
y = districts["Math_metAbove50"]

print("Shape: ", X.shape, y.shape)

In [None]:
# Use the Sklearn `train_test_split()` function to split the data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=42)


In [None]:
# Create the model

classifier = LogisticRegression()
classifier

In [None]:
# Fit the model to the training data. 

classifier.fit(X_train, y_train) 

In [None]:
# Print training and testing scores.

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [None]:
# Create predictions.
y_pred = classifier.predict(X_test)
print(y_pred)

In [None]:
# Print accuracy score.
print(accuracy_score(y_test, y_pred))

In [None]:
# Create confusion matrix.
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

In [None]:
# Print the classification report.
report = classification_report(y_test, y_pred)
print(report)