## Linear Regression

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# load the csv file into a dataframe and preview the first few rows

data_path = 'winequality-red.csv' # change this line to your own data file as required
df = pd.read_csv(data_path)

df.head()

In [None]:
# create a heatmap of correlation between columns
# the upper triangle (including the diagonal) is masked out so each pair of columns
# is only shown once

corr = df.corr() # computed once and reused for both the mask and the plot

plt.figure(figsize=(8,8))
sns.set(font_scale=0.8)

# boolean mask with the same shape as corr: True = hide this cell in the heatmap
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr, mask=mask, annot=True, cmap='RdYlGn', fmt='.2f', vmin=-1, vmax=1)
plt.show()

In [None]:
# separate our data into our inputs, X, and output, y

feature_cols = ['volatile acidity','total sulfur dioxide', 'alcohol'] # change these to column names in your own data
target_col = 'quality' # change this to column name in your own data

X = df[feature_cols]
y = df[target_col]

In [None]:
# split X and y into train and test sets: 80% of the rows go to training,
# and the fixed random_state makes the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=100,
)

In [None]:
# build the linear regression model, then fit it to the training data

lr = LinearRegression()

lr.fit(X=X_train, y=y_train)

In [None]:
# show the fitted intercept and coefficients, one row per input column
# (the model has a single intercept, so the same value is repeated on every row)

model_params = {
    'intercept': lr.intercept_,
    'coefficients': lr.coef_,
}

feature_df = pd.DataFrame(model_params, index=X_train.columns)

feature_df

In [None]:
# R^2 score of the model on the data it was trained on
# higher = better!

train_r2 = lr.score(X_train, y_train)

train_r2

In [None]:
# R^2 score of the model on the held-out test data

test_r2 = lr.score(X_test, y_test)

test_r2

In [None]:
# get our model to make some predictions and then see how they compare to the known answers we have
# in our test data with Mean Absolute Error (MAE)
# lower = better!

preds = lr.predict(X_test)

# scikit-learn metrics take the known answers first, then the predictions
mean_absolute_error(y_test, preds)

In [None]:
# and similar with Root Mean Squared Error (RMSE): the square root of the mean squared error
# lower = better!

# scikit-learn metrics take the known answers first, then the predictions
np.sqrt(mean_squared_error(y_test, preds))

In [None]:
# cross-validate the model over 5 folds of the full X and y, then average the
# per-fold scores - still higher = better!

cross_val = cross_val_score(estimator=lr, X=X, y=y, cv=5)

np.mean(cross_val)

In [None]:
# as an alternative, you can also create and fit your model with the Statsmodels package,
# and see a summary of your model results
# note: this fits on all of X and y, not just the training split

import statsmodels.api as sm

# statsmodels does not include an intercept unless we ask for one, so add a constant
# column to X - without it the line is forced through the origin and the results
# are not comparable with the scikit-learn model fitted earlier
model = sm.OLS(y, sm.add_constant(X))

res = model.fit()

res.summary()

## Logistic Regression

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report, roc_curve, roc_auc_score

In [None]:
# read in data and store in a dataframe

college = pd.read_csv('College_data.csv') # change to your own data file as required

# we create an extra column for our above or below average label - you may need to do this differently for your
# own data, or not at all if the label column you want to predict already exists in your data
# 1 = graduation rate at or above the mean, 0 = below the mean
grad_rate_mean = college['Grad.Rate'].mean() # worked out once, rather than again for every row
college['above_average'] = (college['Grad.Rate'] >= grad_rate_mean).astype('int64')

college.head()

In [None]:
# pick out the input column(s), X, and the label column to predict, y

feature_cols = ['Top10perc'] # change these to column names in your own data
target_col = 'above_average' # change this to a column name in your own data

X = college[feature_cols]
y = college[target_col]

In [None]:
# split the data into training (80%) and testing sets
# stratify = y asks for the split to be stratified on the labels in y

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    train_size=0.8,
    random_state=100,
)

In [None]:
# build the logistic regression model, then fit it to the training data

logreg = LogisticRegression()

logreg.fit(X=X_train, y=y_train)

In [None]:
# if we have a new piece of data we want to predict, we can just enter it to .predict
# and get a predicted label back, 0 or 1

new_uni_top_10_perc = 22

# the model was trained on a dataframe, so we give it a dataframe with the same
# column names back - that way scikit-learn can match the value to the right input
new_uni = pd.DataFrame([[new_uni_top_10_perc]], columns=X_train.columns)

logreg.predict(new_uni)

In [None]:
# we can also look at the probabilities assigned to each label for that new data with
# .predict_proba, which might be easier to read in a dataframe

# same column names as the training data, so scikit-learn can match the input up
new_uni = pd.DataFrame([[new_uni_top_10_perc]], columns=X_train.columns)

# logreg.classes_ gives the label each probability column belongs to
pd.DataFrame(logreg.predict_proba(new_uni), index = ["Prob. of Label"], columns = logreg.classes_)

In [None]:
# we can create a dataframe of new points we want to feed into the model to make predictions with,
# and use .map to put some nice human-readable labels onto our 0 or 1 predicted values

new_values = [20, 22, 25, 40]

# the model input needs the same column name as the X we trained on
new_data = pd.DataFrame({'Top10perc': new_values})

# use the plain values for display, so each cell holds a number rather than a one-item list
preds_df = pd.DataFrame({'Top10Perc': new_values, 'prediction': logreg.predict(new_data)})

preds_df['label'] = preds_df['prediction'].map({0: 'Below', 1: 'Above'})

preds_df

In [None]:
# the same steps as above in one go, this time with more than one column in X
# the dataframe of new data to predict must use the same column names as X

X = college[['Top10perc', 'Top25perc']] # change these to column names in your own data
y = college['above_average'] # change this to a column name in your own data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    train_size=0.8,
    random_state=100,
)

# .fit hands back the fitted model itself, so we can create and train in one line
logreg = LogisticRegression().fit(X_train, y_train)

# change these to match your columns names in X, and enter the list of new data you want to predict
new_data = pd.DataFrame({
    'Top10perc': [20, 22, 25, 40],
    'Top25perc': [30, 32, 35, 50],
})

# predict before adding any extra columns, so the model only sees the input columns
new_data['predictions'] = logreg.predict(new_data)

label_names = {0: 'Below', 1: 'Above'}
new_data['label'] = new_data['predictions'].map(label_names)

new_data

In [None]:
# share of correct predictions the model makes on our training data
# higher = better!

train_accuracy = logreg.score(X_train, y_train)

train_accuracy

In [None]:
# share of correct predictions the model makes on our testing data
# higher = better!

test_accuracy = logreg.score(X_test, y_test)

test_accuracy

In [None]:
# calculate a baseline: the share of rows that carry the most common label, which is the
# score we would get by always guessing that label without looking at the inputs
# if our model scores above are higher than this, the model is performing better than that guess!

label_shares = college['above_average'].value_counts(normalize = True) # change dataframe and column name to your own y column

label_shares.max()

In [None]:
# build a confusion matrix from the model's predictions on the training data
# rows are the actual labels and columns are the predicted labels, so:
#   row 0 / col 0 = true negatives,  row 1 / col 1 = true positives
#   row 0 / col 1 = false positives, row 1 / col 0 = false negatives
# higher true and lower false = better!

preds = logreg.predict(X_train)

train_confusion = confusion_matrix(y_train, preds)

pd.DataFrame(train_confusion)

In [None]:
# precision: out of everything the model predicted as 1, how much really was 1
# (using the training-data predictions made above)
# higher = better!

precision_score(y_true=y_train, y_pred=preds)

In [None]:
# recall: out of everything that really was 1, how much the model predicted as 1
# (using the training-data predictions made above)
# higher = better!

recall_score(y_true=y_train, y_pred=preds)

In [None]:
# see a summary of these with the classification report
# the report comes back as one long piece of text, so we print it to get a
# properly laid out table rather than a single line full of \n characters

print(classification_report(y_train, preds))

In [None]:
# plot a receiver operating characteristic (ROC) curve for the training data
# the further the first line drawn curves towards the top left, the better the model is performing
# the dashed grey diagonal is a reference line from (0, 0) to (1, 1)

# keep only the second column of probabilities, i.e. the probability of label 1
y_pred_proba = logreg.predict_proba(X_train)[:, 1]

fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba, drop_intermediate=False)

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], color='lightgrey', linewidth=1, linestyle='--')
plt.show()

In [None]:
# sum the ROC curve up as a single number: the area under the curve (AUC-ROC)
# higher = better!

roc_auc_score(y_true=y_train, y_score=y_pred_proba)