In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: ggplot theme and a default figure size of
# width 12 by height 8 for every figure created afterwards.
plt.style.use("ggplot")
matplotlib.rcParams["figure.figsize"] = (12, 8)

In [None]:
# Read the election dataset into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../../datasets/usa_election_dataset.csv')

In [None]:
# Show the first five rows as a quick check of what was loaded.
df.head(5)

## Linear regression (simple / multiple) on turnout rate

In [None]:
# Mean and standard deviation of the target column, reported in percent.
# Presumably 'turnout' is stored as a fraction, since it is scaled by 100 for
# display -- confirm against the dataset.
# Both values are scaled to percent *before* rounding: rounding first and then
# multiplying loses precision and can print float noise such as
# 11.600000000000001%.
print(f"Average turn out rate: {round(df['turnout'].mean() * 100, 3)}%")
print(f"Std of turn out rate: {round(df['turnout'].std() * 100, 3)}%")

In [None]:
# Predictor columns used by the regression. Several names give a multiple
# regression; switch to the one-name list below for a simple regression.
# NOTE(review): 'yougn' looks like a misspelling of 'young' -- confirm against
# the CSV header before changing it. The variable name is also spelled
# 'indepedent'; it is left untouched because the later cells refer to it.
indepedent_variables = ['yougn', 'female', 'black']
# indepedent_variables = ['yougn']

# Target vector and feature matrix (one column per predictor).
y = df['turnout'].to_numpy()
X = df[indepedent_variables].to_numpy()
if len(indepedent_variables) == 1:
    # Keep a single predictor in column shape (n_samples, 1).
    X = X.reshape(-1, 1)

# Hold out 22% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.22,
    random_state=42,
)

# Fit on the training rows, then predict the held-out rows.
regmodel = LinearRegression()
regmodel.fit(X_train, y_train)
y_predict = regmodel.predict(X_test)

In [None]:
# Fitted parameters: the intercept, then one coefficient per predictor.
print('Model fits:')
print(f'- intercept: {round(regmodel.intercept_, 3)}')
# Pair each predictor name with the coefficient at the same position.
for name, coefficient in zip(indepedent_variables, regmodel.coef_):
    print(f'- {name}: {round(coefficient, 3)}')

# Scores on the held-out rows: 'error' is the mean absolute error, 'r2' the
# coefficient of determination.
print('Model stats:')
print(f'- error: {round(mean_absolute_error(y_test, y_predict), 5)}')
print(f'- r2: {round(r2_score(y_test, y_predict), 5)}')

In [None]:
# One figure per predictor: actual vs. predicted turnout on the held-out rows.
# Each figure is labelled so it can be read on its own -- the x-axis names the
# predictor and the legend separates observed values from model output.
for i, variable in enumerate(indepedent_variables):
    fig, ax = plt.subplots()
    feature_values = X_test[:, i]  # column i of the 2-D test matrix
    ax.scatter(feature_values, y_test, color='black', label='actual')
    ax.scatter(feature_values, y_predict, color='blue', label='predicted')
    ax.set_xlabel(variable)
    ax.set_ylabel('turnout')
    ax.set_title(f'Turnout vs. {variable} (test set)')
    ax.legend()
    plt.show()