# Data preparing

In [1]:
import pandas as pd
import numpy as np

In [2]:
#input data
dataset = pd.read_csv("application data.csv")
dataset.head()

Unnamed: 0,applied subject,year until now,GPA,current subject,is honor college,GMAT,GRE,english test exempted,TOFEL,IELTS,internship,result
0,Engineering - Civil and Structural,2,3.72,Engineering - Civil and Structural,1,796.09,305.28,0,0.0,8.14,1,0
1,Modern Languages,5,2.48,"Theology, Divinity and Religious Studies",0,0.0,192.8,1,0.0,0.0,1,1
2,Geophysics,10,3.32,Geophysics,1,0.0,308.7,0,0.0,7.38,1,1
3,Business and Management Studies,10,3.65,Materials Sciences,1,763.28,0.0,1,0.0,0.0,1,0
4,English Language and Literature,5,3.86,Computer Science and Information Systems,1,0.0,328.85,0,0.0,6.4,1,1


In [3]:
# display the shape of the dataset
dataset.shape

(25000, 12)

In [112]:
dataset.describe()

Unnamed: 0,year until now,GPA,is honor college,GMAT,GRE,english test exempted,TOFEL,IELTS,internship,result
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,5.00636,3.285474,0.65176,338.318789,196.238197,0.5044,25.022915,2.322859,0.651,0.64544
std,3.161531,0.483219,0.476422,352.990901,145.730378,0.499991,43.646477,3.501595,0.476663,0.478389
min,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,3.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,3.33,1.0,0.0,279.87,1.0,0.0,0.0,1.0,1.0
75%,8.0,3.66,1.0,733.02,320.08,1.0,80.085,6.58,1.0,1.0
max,10.0,4.0,1.0,799.98,340.0,1.0,120.0,9.0,1.0,1.0


In [113]:
#check missing value
dataset.isnull().sum()

applied subject          0
year until now           0
GPA                      0
current subject          0
is honor college         0
GMAT                     0
GRE                      0
english test exempted    0
TOFEL                    0
IELTS                    0
internship               0
result                   0
dtype: int64

# Data processing

In [4]:
#set target and feature
#y is target (result) that we want to predict
y = dataset['result']
#x is feature that we remove the result column
x_oldest = dataset.drop(labels=['result'], axis=1)

In [5]:
x_oldest.head()

Unnamed: 0,applied subject,year until now,GPA,current subject,is honor college,GMAT,GRE,english test exempted,TOFEL,IELTS,internship
0,Engineering - Civil and Structural,2,3.72,Engineering - Civil and Structural,1,796.09,305.28,0,0.0,8.14,1
1,Modern Languages,5,2.48,"Theology, Divinity and Religious Studies",0,0.0,192.8,1,0.0,0.0,1
2,Geophysics,10,3.32,Geophysics,1,0.0,308.7,0,0.0,7.38,1
3,Business and Management Studies,10,3.65,Materials Sciences,1,763.28,0.0,1,0.0,0.0,1
4,English Language and Literature,5,3.86,Computer Science and Information Systems,1,0.0,328.85,0,0.0,6.4,1


In [6]:
y.head()

0    0
1    1
2    1
3    0
4    1
Name: result, dtype: int64

In [7]:
# Transform categorical data
import category_encoders as ce

In [8]:
# applied subject and current subject is category column, convert it to count encoding
count_encoder = ce.CountEncoder(cols=["applied subject", "current subject"])

# x is the dataset with category column converted
x = count_encoder.fit_transform(x_oldest)
x

Unnamed: 0,applied subject,year until now,GPA,current subject,is honor college,GMAT,GRE,english test exempted,TOFEL,IELTS,internship
0,429,2,3.72,430,1,796.09,305.28,0,0.00,8.14,1
1,421,5,2.48,416,0,0.00,192.80,1,0.00,0.00,1
2,405,10,3.32,397,1,0.00,308.70,0,0.00,7.38,1
3,471,10,3.65,410,1,763.28,0.00,1,0.00,0.00,1
4,413,5,3.86,402,1,0.00,328.85,0,0.00,6.40,1
...,...,...,...,...,...,...,...,...,...,...,...
24995,421,9,3.02,424,1,0.00,326.89,0,0.00,7.08,0
24996,410,7,3.31,457,1,0.00,338.98,0,111.04,0.00,1
24997,420,9,3.23,410,1,0.00,336.70,1,0.00,0.00,1
24998,420,1,3.85,428,1,0.00,316.20,1,0.00,0.00,0


In [9]:
# a different encoding approach
one_hot_encoder = ce.OneHotEncoder(cols=["applied subject", "current subject"])
x = one_hot_encoder.fit_transform(x_oldest)
x

Unnamed: 0,applied subject_1,applied subject_2,applied subject_3,applied subject_4,applied subject_5,applied subject_6,applied subject_7,applied subject_8,applied subject_9,applied subject_10,...,current subject_57,current subject_58,current subject_59,is honor college,GMAT,GRE,english test exempted,TOFEL,IELTS,internship
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,796.09,305.28,0,0.00,8.14,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0.00,192.80,1,0.00,0.00,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0.00,308.70,0,0.00,7.38,1
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,763.28,0.00,1,0.00,0.00,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0.00,328.85,0,0.00,6.40,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0.00,326.89,0,0.00,7.08,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0.00,338.98,0,111.04,0.00,1
24997,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0.00,336.70,1,0.00,0.00,1
24998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0.00,316.20,1,0.00,0.00,0


In [10]:
# Scale the data

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

scaler = StandardScaler()
cols_to_scale = ["GPA", "GMAT", "GRE", "TOFEL", "IELTS"]
x_scaled = x.copy()
x_scaled[cols_to_scale] = scaler.fit_transform(x_scaled[cols_to_scale])

x_scaled

Unnamed: 0,applied subject_1,applied subject_2,applied subject_3,applied subject_4,applied subject_5,applied subject_6,applied subject_7,applied subject_8,applied subject_9,applied subject_10,...,current subject_57,current subject_58,current subject_59,is honor college,GMAT,GRE,english test exempted,TOFEL,IELTS,internship
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1.283113,0.751219,0,-0.571637,1.635337,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,-0.970448,-0.019544,1,-0.571637,-0.675794,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,-0.970448,0.774654,0,-0.571637,1.419555,1
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1.190235,-1.340696,1,-0.571637,-0.675794,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,-0.970448,0.912731,0,-0.571637,1.141311,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,-0.970448,0.899300,0,-0.571637,1.334378,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,-0.970448,0.982146,0,1.969994,-0.675794,1
24997,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,-0.970448,0.966523,1,-0.571637,-0.675794,1
24998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,-0.970448,0.826047,1,-0.571637,-0.675794,0


In [11]:
#split dataset into training and testing set
from sklearn.model_selection import train_test_split

In [12]:
train_x, test_x, train_y, test_y = train_test_split(x_scaled, y, train_size=0.8, random_state=1) 

In [13]:
print('Training Features Shape:', train_x.shape)
print('Training Labels Shape:', train_y.shape)
print('Testing Features Shape:', test_x.shape)
print('Testing Labels Shape:', test_y.shape)

Training Features Shape: (20000, 127)
Training Labels Shape: (20000,)
Testing Features Shape: (5000, 127)
Testing Labels Shape: (5000,)


# Random Forest Classifier

In [149]:
#define the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [150]:
#training model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
rf_model.fit(train_x, train_y)

#predict data
test_y_predicted = rf_model.predict(test_x)
print(test_y_predicted)

[0 1 1 ... 1 1 0]


In [151]:
accuracy = accuracy_score(test_y, test_y_predicted)
print(accuracy)


0.7866


# Random Forest Regressor

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
rf_model = RandomForestRegressor(n_estimators=100)
rf_model.fit(train_x, train_y)

#predict data
test_y_predicted = rf_model.predict(test_x)
print(test_y_predicted)

[0.9  0.19 0.2  ... 0.19 0.21 0.88]


In [16]:
# Evaluate
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(test_y, test_y_predicted)
print("Mean Squared Error: ", mse)

Mean Squared Error:  0.17397164


In [17]:
# Save the model
import joblib

joblib.dump(rf_model, 'random_forest_regression_model.joblib')
joblib.dump(one_hot_encoder, 'model_encoder.joblib')
joblib.dump(scaler, 'model_scaler.joblib')

['model_scaler.joblib']

# new

In [18]:
input = ['Performing Arts', 3, 2.2, 'Performing Arts', 0, 0, 220, 0, 75, 0, 0]
fields = ['applied subject', 'year until now', 'GPA', 'current subject', 'is honor college', 'GMAT', 'GRE', 'english test exempted', 'TOFEL', 'IELTS', 'internship']

if not (len(input) == len(fields)):
    raise ValueError("input size doesn't fit field size")
else:
    input = [{fields[i]: input[i] for i in range(len(input))}]

encoder = joblib.load("../static/model_encoder.joblib")
scaler = joblib.load("../static/model_scaler.joblib")
model = joblib.load("../static/random_forest_regression_model.joblib")

x = encoder.transform(input)
cols_to_scale = ["GPA", "GMAT", "GRE", "TOFEL", "IELTS"]
x_scaled = x.copy()
x_scaled[cols_to_scale] = scaler.transform(x_scaled[cols_to_scale])
predicted = model.predict(x_scaled)


In [19]:
print(predicted)

[0.3]


Bad pipe message: %s [b'FV{j45K\x00\xcc}\xaf\xa8\xcb|I\xd09\x10 \xaa=m\xf3\x01\xf1/]m\xbb\xd8\xe1]R\xcd\x02\xb8\xc1F\xc3\x9f\xd9\x10}\x8f\xecJ\xf7\xe0\xad\r\xe1\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r']
Bad pipe message: %s [b'E\x83\x0e\x85\xfc\x12=']
Bad pipe message: %s [b'\xbaqO!5\xd9?\xa5\x16\xe3e7>\x19R\x7f9\xba\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\x00\x86\x00\x85\xc0\x19\x00:\x00\x89\xc0\x0f\xc0\x05\x005\x00\x84\xc0\x13\xc0\t\x003\x002\x001\x000\x00\x9a\x00\x99\x00\x98\x00\x97\x00E\x00D\x00C\x00B\xc0\x18\x004\x00\x9b\x00F\xc0\x0e\xc0\x04\x00/\x00\x96\x00A\x00\x07\xc0\x11\xc0\x07\xc0\x16\x00\x18\xc0\x0c\xc0\x02\x00\x05\x00\x04\xc0\x12\xc0\x08\x00\x16\x00\x13\x00\x10\x00\r\xc0\x17\x00\x1b']
Bad pipe message: %s [b"d\x13\x1a'\xd3\x04\xf6\x917\x1a\xe9\xd2\n\xc0