In [48]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [49]:
df = pd.read_csv(Path('AllCounties2020.csv'))
df.head()

Unnamed: 0,Month,County,Population,Avg_Monthly_Wages,Avg_HV
0,1/1/2020,Alameda,1679844,6736.0,904671.0
1,1/1/2020,Alpine,1198,3547.0,423354.0
2,1/1/2020,Amador,40541,3864.0,325168.0
3,1/1/2020,Butte,210083,3750.0,340989.0
4,1/1/2020,Calaveras,45327,3494.0,334012.0


In [50]:
df.count()

Month                696
County               696
Population           696
Avg_Monthly_Wages    696
Avg_HV               680
dtype: int64

In [51]:
df = df.dropna()

In [52]:
df.count()

Month                680
County               680
Population           680
Avg_Monthly_Wages    680
Avg_HV               680
dtype: int64

In [53]:
df['Outcome'] = ""

In [54]:
df

Unnamed: 0,Month,County,Population,Avg_Monthly_Wages,Avg_HV,Outcome
0,1/1/2020,Alameda,1679844,6736.0,904671.0,
1,1/1/2020,Alpine,1198,3547.0,423354.0,
2,1/1/2020,Amador,40541,3864.0,325168.0,
3,1/1/2020,Butte,210083,3750.0,340989.0,
4,1/1/2020,Calaveras,45327,3494.0,334012.0,
...,...,...,...,...,...,...
690,12/1/2020,Trinity,12869,3436.0,287656.0,
691,12/1/2020,Tulare,372792,3496.0,257172.0,
693,12/1/2020,Ventura,765840,4880.0,664960.0,
694,12/1/2020,Yolo,174879,5056.0,509539.0,


In [55]:
df['Avg_HV'].mean()

490340.0073529412

In [56]:
df['Avg_HV'].median()

376990.5

In [57]:
df['Outcome'] = df['Avg_HV'].apply(lambda x: 0 if x <= 400000 else 1)

In [58]:
df

Unnamed: 0,Month,County,Population,Avg_Monthly_Wages,Avg_HV,Outcome
0,1/1/2020,Alameda,1679844,6736.0,904671.0,1
1,1/1/2020,Alpine,1198,3547.0,423354.0,1
2,1/1/2020,Amador,40541,3864.0,325168.0,0
3,1/1/2020,Butte,210083,3750.0,340989.0,0
4,1/1/2020,Calaveras,45327,3494.0,334012.0,0
...,...,...,...,...,...,...
690,12/1/2020,Trinity,12869,3436.0,287656.0,0
691,12/1/2020,Tulare,372792,3496.0,257172.0,0
693,12/1/2020,Ventura,765840,4880.0,664960.0,1
694,12/1/2020,Yolo,174879,5056.0,509539.0,1


In [59]:
y = df["Outcome"]
X = df.drop(columns=["Month","County","Avg_HV","Outcome"])

In [60]:
y

0      1
1      1
2      0
3      0
4      0
      ..
690    0
691    0
693    1
694    1
695    0
Name: Outcome, Length: 680, dtype: int64

In [61]:
X

Unnamed: 0,Population,Avg_Monthly_Wages
0,1679844,6736.0
1,1198,3547.0
2,40541,3864.0
3,210083,3750.0
4,45327,3494.0
...,...,...
690,12869,3436.0
691,372792,3496.0
693,765840,4880.0
694,174879,5056.0


In [62]:
y.shape

(680,)

In [63]:
X.shape

(680, 2)

In [64]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(510, 2)

In [65]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [66]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [67]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,1,1
2,1,1
3,0,0
4,1,0
5,0,0
6,0,1
7,0,0
8,1,0
9,0,0


In [68]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.711764705882353


In [69]:
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[67 22]
 [27 54]]


In [70]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.71      0.75      0.73        89
           1       0.71      0.67      0.69        81

    accuracy                           0.71       170
   macro avg       0.71      0.71      0.71       170
weighted avg       0.71      0.71      0.71       170

