In [None]:
# In the following, we will go through an example of building binary classification models
# In the example, we will look at data about whether somebody was admitted to a university or not
# First we import the libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the admissions data set.
# Each row holds a student's exam results (gre and gpa) and a measure of the
# rank of the high school the student comes from.

admit_data = pd.read_csv(filepath_or_buffer='school_data.csv')

# Preview the first five rows (five is the default for head()).
admit_data.head(n=5)

In [None]:
# Basic data understanding: how are the variables related to each other and,
# most importantly, to admit?
# Without aiming to be complete, we look at the pairwise correlations
# (Pearson, which is the pandas default method).
# We can see that gre and gpa seem to be moderately correlated.

admit_data.corr(method='pearson')

In [None]:
# Count the rows in every (admit, rank) combination (note: rank 1 is the best).
# size() counts rows per group, so no column selection is needed beforehand.

admit_data.groupby(['admit', 'rank']).size()

In [None]:
# Box plot of gpa for each admit category.
# x and y are passed by keyword: seaborn 0.12+ only accepts `data` positionally,
# so the positional form sns.boxplot('admit', 'gpa', ...) raises a TypeError there.

fig, ax = plt.subplots()
sns.boxplot(x='admit', y='gpa', data=admit_data, ax=ax)
ax.set_title('gpa by admit category');  # trailing ';' hides the Text repr in the cell output

In [None]:
# All the variables seem to be relevant to some extent, so we can build a model.
# First step: split the data into a training set and a test set
# (25% of the rows are held out for testing; the seed is fixed for reproducibility).

feature_columns = ['gre', 'gpa', 'rank']

X_train, X_test, y_train, y_test = train_test_split(
    admit_data[feature_columns],
    admit_data['admit'],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fit a logistic regression model on the training set.
# fit() returns the estimator itself, so construction and fitting can be chained.

logistic_regression = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Show the fitted estimator as the cell output.
logistic_regression

In [None]:
# Predict on both the training and the test set so the results can be compared.

y_train_predict, y_test_predict = (
    logistic_regression.predict(features) for features in (X_train, X_test)
)

In [None]:
# Compute the classification report for both the training and the test set.
# As we can see, accuracy is lower for the test set as expected, but not too much lower.

report_train = metrics.classification_report(y_true=y_train, y_pred=y_train_predict)
report_test = metrics.classification_report(y_true=y_test, y_pred=y_test_predict)

# Training report first, then the test report.
print(report_train)
print(report_test)