In [26]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from scipy import stats

# Upload and Transform Data

In [3]:
# Read the CSV file into a Pandas DataFrame

districts = pd.read_csv('District_data.csv')
districts = districts.replace('redacted',0)
districts.head()

Unnamed: 0,District,County,District_type,Enrollment,FRL_Perc,Disadv_Perc,EL_Perc,Grad_Perc,Teach_to_stud,Susp_Perc,Chronic_absent,Math_metAbove,ELA_metAbove,Per_pupil_exp,Teacher_salary,Avg Years Teaching (District)
0,Happy Camp Union Elementary (Siskiyou),Siskiyou,Elementary School District,110,77.3,77.27,,,,7.9,29.5,9.23,17.91,13585,76081,
1,Shoreline Unified (Marin),Marin,Unified School District,508,66.9,68.9,42.1,94.3,,3.7,17.7,27.41,43.63,29742,87808,
2,Cienega Union Elementary (San Benito),San Benito,Elementary School District,25,32.0,44.0,28.0,,0.0,0.0,6.3,35.0,42.11,11515,76081,
3,Alpine County Office of Education (Alpine),Alpine,County Office of Education (COE),6023,0.0,0.0,,,0.0,,,37.2,48.3,14708,76081,
4,Arena Union Elementary/Point Arena Joint Union...,Mendocino,Common Administration District,6023,56.8,60.0,18.4,,20.2,3.5,13.1,37.3,48.3,22151,57730,


In [4]:
# Review data types.
districts.dtypes

District                          object
County                            object
District_type                     object
Enrollment                         int64
FRL_Perc                         float64
Disadv_Perc                      float64
EL_Perc                          float64
Grad_Perc                        float64
Teach_to_stud                    float64
Susp_Perc                         object
Chronic_absent                    object
Math_metAbove                    float64
ELA_metAbove                     float64
Per_pupil_exp                      int64
Teacher_salary                     int64
Avg Years Teaching (District)    float64
dtype: object

In [5]:
# Rename average years of teaching column.
districts = districts.rename(columns={"Avg Years Teaching (District)": "Avg_years_teaching"})
districts

Unnamed: 0,District,County,District_type,Enrollment,FRL_Perc,Disadv_Perc,EL_Perc,Grad_Perc,Teach_to_stud,Susp_Perc,Chronic_absent,Math_metAbove,ELA_metAbove,Per_pupil_exp,Teacher_salary,Avg_years_teaching
0,Happy Camp Union Elementary (Siskiyou),Siskiyou,Elementary School District,110,77.3,77.27,,,,7.9,29.5,9.23,17.91,13585,76081,
1,Shoreline Unified (Marin),Marin,Unified School District,508,66.9,68.90,42.1,94.3,,3.7,17.7,27.41,43.63,29742,87808,
2,Cienega Union Elementary (San Benito),San Benito,Elementary School District,25,32.0,44.00,28.0,,0.0,0,6.3,35.00,42.11,11515,76081,
3,Alpine County Office of Education (Alpine),Alpine,County Office of Education (COE),6023,0.0,0.00,,,0.0,,,37.20,48.30,14708,76081,
4,Arena Union Elementary/Point Arena Joint Union...,Mendocino,Common Administration District,6023,56.8,60.00,18.4,,20.2,3.5,13.1,37.30,48.30,22151,57730,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,Lincoln Elementary (Marin),Marin,Elementary School District,4,0.0,0.00,,,0.0,0,0,37.20,48.30,86414,76081,1.0
1032,SBE - KIPP Bayview Elementary (San Francisco),San Francisco,State Board of Education Charter,140,90.0,90.00,2.1,,35.0,7.4,37.1,37.20,48.30,14708,76081,1.0
1033,SBE - KIPP Navigate College Prep (Santa Clara),Santa Clara,State Board of Education Charter,178,75.3,82.02,14.6,,22.3,7.1,11.3,37.20,48.30,14708,76081,1.0
1034,Forks of Salmon Elementary (Siskiyou),Siskiyou,Elementary School District,9,100.0,100.00,18.4,,9.0,3.5,13.1,37.30,48.30,31008,76081,1.0


In [6]:
# Replace NaaNs with means.
districts = districts.fillna(districts.mean())

# Predicting ELA Scores

In [7]:
# Assign the data to X and y

X = districts[["Enrollment","FRL_Perc","Teach_to_stud","Per_pupil_exp","Teacher_salary","Avg_years_teaching"]]
y = districts["ELA_metAbove"]

print("Shape: ", X.shape, y.shape)

Shape:  (1036, 6) (1036,)


In [None]:
# Plot the data to find out if a linear trend exists

plt.scatter(X, y)
plt.xlabel("Teacher Salary")
plt.ylabel("Math Achievement")

In [8]:
# Use the Sklearn `train_test_split()` function to split the data into training and testing data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [9]:
# Create the model

from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [10]:
# Fit the model to the training data. 

model.fit(X_train, y_train)


LinearRegression()

In [11]:
# Calculate the mean_squared_error and the r-squared value for the testing data

from sklearn.metrics import mean_squared_error, r2_score

# Use our model to make predictions
predicted = model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

# Print key statistics, including the R-squared and model parameters.
print(f"mean squared error (MSE): {mse}")
print(f"R-squared (R2): {r2}")
print(model.coef_)
print(model.intercept_)


mean squared error (MSE): 108.88437815619356
R-squared (R2): 0.5797436478318847
[ 3.11068001e-05 -4.97715486e-01  8.64731213e-02 -1.50586939e-05
  1.45718894e-04  5.50803954e-01]
58.58319955033581


# Predicting Math Scores

In [17]:
# Assign the data to X and y

X = districts[["Enrollment","FRL_Perc","Teach_to_stud","Per_pupil_exp","Teacher_salary","Avg_years_teaching"]]
y = districts["Math_metAbove"]

print("Shape: ", X.shape, y.shape)

Shape:  (1036, 6) (1036,)


In [18]:
 # Use the Sklearn `train_test_split()` function to split the data into training and testing data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [19]:
# Create the model.
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [20]:
# Fit the model to the training data.
model.fit(X_train, y_train)

LinearRegression()

In [32]:
# Calculate the mean_squared_error and the r-squared value for the testing data

from sklearn.metrics import mean_squared_error, r2_score

# Use our model to make predictions
predicted = model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

# Print key statistics, including the R-squared and model parameters.
print(f"mean squared error (MSE): {mse}")
print(f"R-squared (R2): {r2}")
print(model.coef_)
print(model.intercept_)

# HOW DO WE PRINT THE SUMMARY STATISTICS???

mean squared error (MSE): 128.09268952301048
R-squared (R2): 0.5390818194092863
[-1.77046232e-05 -5.23587486e-01  7.22478883e-02 -8.19078377e-06
  1.76546878e-04  6.18227879e-01]
46.63165530840816


AttributeError: 'LinearRegression' object has no attribute 'summary'