In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the CSV file into a Pandas DataFrame

districts = pd.read_csv('District_data.csv')
districts = districts.replace('redacted',0)
districts.head()

In [145]:
# Review data types.
districts.dtypes

District               object
County                 object
District_type          object
Enrollment              int64
FRL_Perc              float64
Disadv_Perc           float64
EL_Perc               float64
Grad_Perc             float64
Teach_to_stud         float64
Susp_Perc              object
Chronic_absent         object
Math_metAbove         float64
ELA_metAbove          float64
Per_pupil_exp           int64
Teacher_salary          int64
Avg_years_teaching    float64
dtype: object

In [None]:
# Rename average years of teaching column.
districts = districts.rename(columns={"Avg Years Teaching (District)": "Avg_years_teaching"})
districts

In [141]:
# Observe means.

Enrollment             6023.941120
FRL_Perc                 56.800965
Disadv_Perc              59.862452
EL_Perc                  18.437052
Grad_Perc                84.543100
Teach_to_stud            20.193016
Math_metAbove            37.324440
ELA_metAbove             48.332442
Per_pupil_exp         14708.595560
Teacher_salary        76081.446911
Avg_years_teaching        9.894221
dtype: float64

In [144]:
### Replace NaaNs with means.
districts = districts.fillna(districts.mean())

Unnamed: 0,District,County,District_type,Enrollment,FRL_Perc,Disadv_Perc,EL_Perc,Grad_Perc,Teach_to_stud,Susp_Perc,Chronic_absent,Math_metAbove,ELA_metAbove,Per_pupil_exp,Teacher_salary,Avg_years_teaching
0,Happy Camp Union Elementary (Siskiyou),Siskiyou,Elementary School District,110,77.3,77.27,18.437052,84.5431,20.193016,7.9,29.5,9.23,17.91,13585,76081,9.894221
1,Shoreline Unified (Marin),Marin,Unified School District,508,66.9,68.9,42.1,94.3,20.193016,3.7,17.7,27.41,43.63,29742,87808,9.894221
2,Cienega Union Elementary (San Benito),San Benito,Elementary School District,25,32.0,44.0,28.0,84.5431,0.0,0.0,6.3,35.0,42.11,11515,76081,9.894221
3,Alpine County Office of Education (Alpine),Alpine,County Office of Education (COE),6023,0.0,0.0,18.437052,84.5431,0.0,,,37.2,48.3,14708,76081,9.894221
4,Arena Union Elementary/Point Arena Joint Union...,Mendocino,Common Administration District,6023,56.8,60.0,18.4,84.5431,20.2,3.5,13.1,37.3,48.3,22151,57730,9.894221
5,Modesto City Schools (Stanislaus),Stanislaus,Common Administration District,6023,56.8,60.0,18.4,84.5431,20.2,3.5,13.1,37.3,48.3,13243,93066,9.894221
6,Petaluma City Elementary/Joint Union High (Son...,Sonoma,Common Administration District,6023,56.8,60.0,18.4,84.5431,20.2,3.5,13.1,37.3,48.3,12477,72581,9.894221
7,Santa Cruz City Elementary/High (Santa Cruz),Santa Cruz,Common Administration District,6023,56.8,60.0,18.4,84.5431,20.2,3.5,13.1,37.3,48.3,14533,77764,9.894221
8,Santa Rosa City Schools (Sonoma),Sonoma,Common Administration District,6023,56.8,60.0,18.4,84.5431,20.2,3.5,13.1,37.3,48.3,13576,74869,9.894221
9,SBE - Celerity Rolas (Los Angeles),Los Angeles,State Board of Education Charter,6023,56.8,60.0,18.4,84.5431,20.2,3.5,13.1,37.3,48.3,14708,76081,9.894221


In [146]:
# Assign the data to X and y
# Note: Sklearn requires a two-dimensional array of values so we use reshape() to create this

X = districts[["Enrollment","FRL_Perc","Teach_to_stud","Per_pupil_exp","Teacher_salary","Avg_years_teaching"]]
y = districts["ELA_metAbove"]

print("Shape: ", X.shape, y.shape)

Shape:  (1036, 6) (1036,)


In [None]:
 # Plot the data to find out if a linear trend exists

### BEGIN SOLUTION
plt.scatter(X, y)
plt.xlabel("Teacher Salary")
plt.ylabel("Math Achievement")
### END SOLUTION

In [147]:
 # Use the Sklearn `train_test_split()` function to split the data into training and testing data

from sklearn.model_selection import train_test_split

### BEGIN SOLUTION

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### END SOLUTION

In [148]:
 # Create the model

### BEGIN SOLUTION

from sklearn.linear_model import LinearRegression

model = LinearRegression()

### END SOLUTION

In [149]:
# Fit the model to the training data. 

### BEGIN SOLUTION

model.fit(X_train, y_train)

### END SOLUTION

LinearRegression()

In [150]:
# Calculate the mean_squared_error and the r-squared value
# for the testing data

from sklearn.metrics import mean_squared_error, r2_score

### BEGIN SOLUTION

# Use our model to make predictions
predicted = model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

print(f"mean squared error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

### END SOLUTION

mean squared error (MSE): 108.88437815619356
R-squared (R2): 0.5797436478318847


In [151]:
# Call the `score()` method on the model to show the R2 score

### BEGIN SOLUTION
model.score(X_test, y_test)
### END SOLUTION

0.5797436478318847

In [None]:
predicted