In [1]:
#importing libraries and dependencies
import sys
import os
import collections
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy import stats
import re
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import nltk

In [2]:
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
#importing data
df = pd.read_csv('clean_sample_50k.csv')
df.drop(columns = ['Unnamed: 0'], axis = 1, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54940 entries, 0 to 54939
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   University   54940 non-null  object 
 1   Major        54938 non-null  object 
 2   Program      54940 non-null  object 
 3   Citizenship  54940 non-null  object 
 4   Semester     54940 non-null  object 
 5   Year         54940 non-null  int64  
 6   Q_Score      54940 non-null  int64  
 7   V_Score      54940 non-null  int64  
 8   AWA_Score    54940 non-null  float64
 9   GPA_Score    54940 non-null  float64
 10  Status       54939 non-null  object 
dtypes: float64(2), int64(3), object(6)
memory usage: 4.6+ MB


In [4]:
df['Program'].value_counts()

PhD        28506
Masters    26385
MBA           26
EdD           18
JD             3
IND            2
Name: Program, dtype: int64

In [5]:
data1 = df[df['Program'].str.contains("PhD")==True]
data2 = df[df['Program'].str.contains("Masters")==True]

In [6]:
frames = [data1, data2]
  
df = pd.concat(frames,ignore_index=True)

In [7]:
df['Program'].value_counts()

PhD        28506
Masters    26385
Name: Program, dtype: int64

In [8]:
df.columns

Index(['University', 'Major', 'Program', 'Citizenship', 'Semester', 'Year',
       'Q_Score', 'V_Score', 'AWA_Score', 'GPA_Score', 'Status'],
      dtype='object')

In [9]:
df.shape

(54891, 11)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54891 entries, 0 to 54890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   University   54891 non-null  object 
 1   Major        54889 non-null  object 
 2   Program      54891 non-null  object 
 3   Citizenship  54891 non-null  object 
 4   Semester     54891 non-null  object 
 5   Year         54891 non-null  int64  
 6   Q_Score      54891 non-null  int64  
 7   V_Score      54891 non-null  int64  
 8   AWA_Score    54891 non-null  float64
 9   GPA_Score    54891 non-null  float64
 10  Status       54890 non-null  object 
dtypes: float64(2), int64(3), object(6)
memory usage: 4.6+ MB


In [11]:
#FILTERING OUT ONLY ACCEPTED INSTANCES FOR OUR DATA
df = df[df['Status'] == 'Accepted']
df.reset_index(drop = True, inplace = True)
df.head()

Unnamed: 0,University,Major,Program,Citizenship,Semester,Year,Q_Score,V_Score,AWA_Score,GPA_Score,Status
0,University Of Minnesota,Pure Mathematics,PhD,American,Fall,2022,168,164,4.0,3.8,Accepted
1,University Of Tennessee Knoxville,Computer Engineering,PhD,International,Fall,2022,162,145,4.0,3.7,Accepted
2,The University Of Texas At San Antonio,Business Administration,PhD,International,Fall,2022,157,146,4.5,3.6,Accepted
3,University Of New Mexico,Mechanical Engineering,PhD,International,Fall,2022,165,152,3.0,3.6,Accepted
4,University Of Virginia,Education Policy,PhD,International,Fall,2022,166,166,4.5,4.0,Accepted


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27777 entries, 0 to 27776
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   University   27777 non-null  object 
 1   Major        27776 non-null  object 
 2   Program      27777 non-null  object 
 3   Citizenship  27777 non-null  object 
 4   Semester     27777 non-null  object 
 5   Year         27777 non-null  int64  
 6   Q_Score      27777 non-null  int64  
 7   V_Score      27777 non-null  int64  
 8   AWA_Score    27777 non-null  float64
 9   GPA_Score    27777 non-null  float64
 10  Status       27777 non-null  object 
dtypes: float64(2), int64(3), object(6)
memory usage: 2.3+ MB


In [13]:
#CREATING PROCESSED DATA
processed_data = df[['Major','Program','Q_Score', 'V_Score', 'AWA_Score', 'GPA_Score', 'University']]
processed_data

Unnamed: 0,Major,Program,Q_Score,V_Score,AWA_Score,GPA_Score,University
0,Pure Mathematics,PhD,168,164,4.0,3.8,University Of Minnesota
1,Computer Engineering,PhD,162,145,4.0,3.7,University Of Tennessee Knoxville
2,Business Administration,PhD,157,146,4.5,3.6,The University Of Texas At San Antonio
3,Mechanical Engineering,PhD,165,152,3.0,3.6,University Of New Mexico
4,Education Policy,PhD,166,166,4.5,4.0,University Of Virginia
...,...,...,...,...,...,...,...
27772,Mechanical Engineering,Masters,170,162,4.5,3.2,NYU Tandon School Of Engineering
27773,Mechanical Engineering(Mechatronics And Robotics),Masters,170,154,3.5,3.5,Politecnico Di Milano (Polimi)
27774,Mechanical Engineering(Biomechanical Design),Masters,170,154,3.5,3.5,TU Delft Netherlands
27775,MSc Economics,Masters,166,161,4.5,3.8,UCL(University College London)


In [14]:
#CONVERTING GPA FROM 10 POINT SCALE TO 4 POINT SCALE
processed_data["GPA_Score"].replace({0.0:3.8 , 8.8:3.6 , 10.0:4.0 , 9.9:3.9 , 9.8:3.9 , 9.7:3.9 ,9.6:3.8 ,9.5:3.8 , 9.4:3.8 , 9.3:3.7, 9.2:3.7 ,9.1:3.7, 9.0:3.6 , 8.9:3.6 , 8.7:3.5 , 8.6:3.5 , 8.5:3.5 ,8.4:3.4 , 8.3:3.4 , 8.2:3.4 , 8.1:3.3 , 8.0:3.3 , 7.9:3.3 , 7.8:3.2 , 7.7:3.2 , 7.6:3.2 , 7.5:3.1 , 7.4:3.1 , 7.3:3.1 ,7.2:3.0 ,7.1:3.0 ,6.9:3.0 ,6.8:2.9 ,6.7:2.9 ,6.6:2.8 ,6.5:2.8 , 6.4:2.7 , 6.3:2.7 , 6.2:2.6 ,6.1:2.6 ,6.0:2.5 ,5.9:2.5 ,5.8:2.4, 5.7:2.4 , 5.6:2.3 ,5.5:2.3 ,5.4:2.2 ,5.3:2.2 ,5.2:2.1,5.1:2.1 }, inplace=True)

In [15]:
processed_data['GPA_Score'].value_counts()

3.8    7229
4.0    4616
3.9    4350
3.7    2896
3.6    2652
3.5    1872
3.4    1273
3.3     934
3.2     809
3.1     394
3.0     367
2.8     107
2.9     102
2.7      42
2.5      33
2.6      32
4.1      15
4.2       9
4.3       8
2.4       8
2.2       4
4.5       4
2.1       3
4.8       3
2.3       3
4.6       3
7.0       2
2.0       2
1.3       2
4.7       1
4.4       1
0.8       1
Name: GPA_Score, dtype: int64

In [16]:
#LOOKING FOR NAN VALUES
nan_rows = processed_data[processed_data['Major'].isnull()]
nan_rows

Unnamed: 0,Major,Program,Q_Score,V_Score,AWA_Score,GPA_Score,University
9930,,PhD,162,151,3.0,3.8,University of Illinois


In [17]:
#DROPPING NAN VALUES
processed_data.drop(processed_data.iloc[9930].name,  inplace=True)

In [18]:
processed_data

Unnamed: 0,Major,Program,Q_Score,V_Score,AWA_Score,GPA_Score,University
0,Pure Mathematics,PhD,168,164,4.0,3.8,University Of Minnesota
1,Computer Engineering,PhD,162,145,4.0,3.7,University Of Tennessee Knoxville
2,Business Administration,PhD,157,146,4.5,3.6,The University Of Texas At San Antonio
3,Mechanical Engineering,PhD,165,152,3.0,3.6,University Of New Mexico
4,Education Policy,PhD,166,166,4.5,4.0,University Of Virginia
...,...,...,...,...,...,...,...
27772,Mechanical Engineering,Masters,170,162,4.5,3.2,NYU Tandon School Of Engineering
27773,Mechanical Engineering(Mechatronics And Robotics),Masters,170,154,3.5,3.5,Politecnico Di Milano (Polimi)
27774,Mechanical Engineering(Biomechanical Design),Masters,170,154,3.5,3.5,TU Delft Netherlands
27775,MSc Economics,Masters,166,161,4.5,3.8,UCL(University College London)


In [19]:
#GROUPING DIFFERENT SUBJECTS UNDER A BIG MAJOR
conditions = [
    (processed_data['Major'].str.contains('Comput', case=False)),
    (processed_data['Major'].str.contains('Cybersecurity', case=False)),
    (processed_data['Major'].str.contains('Data Science', case=False)),
    (processed_data['Major'].str.contains('Software Engineering', case=False)),
    (processed_data['Major'].str.contains('Artificial Intelligence', case=False)),
    (processed_data['Major'].str.contains('Data Analytics', case=False)),
    (processed_data['Major'].str.contains('Physics')),
    (processed_data['Major'].str.contains('Chemistry')),
    (processed_data['Major'].str.contains('Mathematics', case=False)),
    (processed_data['Major'].str.contains('Sociology')),
    (processed_data['Major'].str.contains('Engineering', case=False)),
    (processed_data['Major'].str.contains('English')),
    (processed_data['Major'].str.contains('Economics')),
    (processed_data['Major'].str.contains('Science')),
    (processed_data['Major'].str.contains('English')),
    (processed_data['Major'].str.contains('Speech Language Pathology')) 
    ]
values = ['Computer Science','Computer Science','Computer Science','Computer Science','Computer Science','Computer Science','Physics', 'Chemistry', 'Mathematics','Sociology','Engineering','English','Economics','Science','English','Speech Language Pathology']
processed_data["Major_grp"] = np.select(conditions, values)
processed_data

Unnamed: 0,Major,Program,Q_Score,V_Score,AWA_Score,GPA_Score,University,Major_grp
0,Pure Mathematics,PhD,168,164,4.0,3.8,University Of Minnesota,Mathematics
1,Computer Engineering,PhD,162,145,4.0,3.7,University Of Tennessee Knoxville,Computer Science
2,Business Administration,PhD,157,146,4.5,3.6,The University Of Texas At San Antonio,0
3,Mechanical Engineering,PhD,165,152,3.0,3.6,University Of New Mexico,Engineering
4,Education Policy,PhD,166,166,4.5,4.0,University Of Virginia,0
...,...,...,...,...,...,...,...,...
27772,Mechanical Engineering,Masters,170,162,4.5,3.2,NYU Tandon School Of Engineering,Engineering
27773,Mechanical Engineering(Mechatronics And Robotics),Masters,170,154,3.5,3.5,Politecnico Di Milano (Polimi),Engineering
27774,Mechanical Engineering(Biomechanical Design),Masters,170,154,3.5,3.5,TU Delft Netherlands,Engineering
27775,MSc Economics,Masters,166,161,4.5,3.8,UCL(University College London),Economics


In [20]:
#FILLING OTHER VALUES FOR MAJOR GRPS WITH RESP3CTIVE MAJOR NAMES
processed_data['Major_grp'] = np.where(processed_data['Major_grp'] == '0',processed_data['Major'],processed_data['Major_grp'])

In [21]:
#DROPPING ALL THE MAJORS WITH VALUE CONT LESS THAN 2
processed_data=processed_data.groupby('Major').filter(lambda x : len(x)>2)
processed_data.reset_index(drop = True, inplace = True)

In [22]:
processed_data

Unnamed: 0,Major,Program,Q_Score,V_Score,AWA_Score,GPA_Score,University,Major_grp
0,Pure Mathematics,PhD,168,164,4.0,3.8,University Of Minnesota,Mathematics
1,Computer Engineering,PhD,162,145,4.0,3.7,University Of Tennessee Knoxville,Computer Science
2,Business Administration,PhD,157,146,4.5,3.6,The University Of Texas At San Antonio,Business Administration
3,Mechanical Engineering,PhD,165,152,3.0,3.6,University Of New Mexico,Engineering
4,Education Policy,PhD,166,166,4.5,4.0,University Of Virginia,Education Policy
...,...,...,...,...,...,...,...,...
23897,Master Of Statistics,Masters,162,169,4.5,4.0,KU Leuven,Master Of Statistics
23898,Anthropology,Masters,155,147,4.0,3.9,Georgia State (GSU),Anthropology
23899,Mechanical Engineering,Masters,170,162,4.5,3.2,NYU Tandon School Of Engineering,Engineering
23900,MSc Economics,Masters,166,161,4.5,3.8,UCL(University College London),Economics


In [23]:
processed_data['Major'].value_counts()[0:50]

Computer Science                               2582
Speech Language Pathology                      1816
Economics                                      1428
Chemistry                                       934
Physics                                         865
Mechanical Engineering                          699
Electrical And Computer Engineering             608
Statistics                                      526
Political Science                               489
Chemical Engineering                            476
Philosophy                                      406
Sociology                                       397
( ECE ) Electrical And Computer Engineering     307
Electrical Engineering                          298
Data Science                                    260
Pure Mathematics                                257
Public Policy                                   253
Mathematics                                     252
Materials Science And Engineering               240
Biostatistic

In [24]:
#sCALING OUR FEATURES
scaled_processed_data = processed_data.copy()
col_names = ['Q_Score', 'V_Score', 'AWA_Score','GPA_Score']
features = processed_data[col_names]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)
scaled_processed_data[col_names] = features
scaled_processed_data

Unnamed: 0,Major,Program,Q_Score,V_Score,AWA_Score,GPA_Score,University,Major_grp
0,Pure Mathematics,PhD,0.805513,0.774614,-0.313521,0.309808,University Of Minnesota,Mathematics
1,Computer Engineering,PhD,-0.014943,-2.221593,-0.313521,-0.074510,University Of Tennessee Knoxville,Computer Science
2,Business Administration,PhD,-0.698657,-2.063898,0.171960,-0.458828,The University Of Texas At San Antonio,Business Administration
3,Mechanical Engineering,PhD,0.395285,-1.117727,-1.284483,-0.458828,University Of New Mexico,Engineering
4,Education Policy,PhD,0.532028,1.090004,0.171960,1.078445,University Of Virginia,Education Policy
...,...,...,...,...,...,...,...,...
23897,Master Of Statistics,Masters,-0.014943,1.563090,0.171960,1.078445,KU Leuven,Master Of Statistics
23898,Anthropology,Masters,-0.972143,-1.906203,-0.313521,0.694127,Georgia State (GSU),Anthropology
23899,Mechanical Engineering,Masters,1.078999,0.459224,0.171960,-1.996100,NYU Tandon School Of Engineering,Engineering
23900,MSc Economics,Masters,0.532028,0.301529,0.171960,0.309808,UCL(University College London),Economics


In [25]:
X=scaled_processed_data.drop(['Major','University','Major_grp','Program'],axis=1)
y=scaled_processed_data['University']

In [26]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=0)

In [27]:
classifier = KNeighborsClassifier(n_neighbors =1000, metric = 'euclidean')
classifier.fit(X, y)

In [92]:
testSet = [['Computer Science','Masters',150, 150, 4.0 ,3.5]]

In [93]:
#PREPARING TESTSET INTO REQUIRED FORMAT
def testpreprocessing(testSet):
    test = pd.DataFrame(testSet)
    test.columns =['Major', 'Program','Q_Score', 'V_Score', 'AWA_Score','GPA_Score']
    scaled_processed_test_data = test.copy()
    col_names = ['Q_Score', 'V_Score', 'AWA_Score','GPA_Score']
    features = test[col_names]
    features = scaler.transform(features.values)
    scaled_processed_test_data[col_names] = features
    return scaled_processed_test_data

processed_test_data=testpreprocessing(testSet)
processed_test_data

Unnamed: 0,Major,Program,Q_Score,V_Score,AWA_Score,GPA_Score
0,Computer Science,Masters,-1.655856,-1.433118,-0.313521,-0.843146


In [94]:
processed_test_data.loc[:1,]

Unnamed: 0,Major,Program,Q_Score,V_Score,AWA_Score,GPA_Score
0,Computer Science,Masters,-1.655856,-1.433118,-0.313521,-0.843146


In [95]:
def recomendor(data):
    Subject=data['Major']
    col_names = ['Q_Score', 'V_Score', 'AWA_Score','GPA_Score']
    features = data[col_names]
    neighbors=classifier.kneighbors(features,return_distance=False)
    neighbors=neighbors.flatten()
    neighbors=neighbors.tolist()
    res = pd.DataFrame(neighbors,columns=['Index'])
    maj=[]
    uni=[]
    Q_Score=[]
    V_Score=[]
    AWA_Score=[]
    GPA_Score=[]
    Major_grp=[]
    Program=[]
    k=1000
    for i in range(0,k):
        uni.append(processed_data['University'].loc[res['Index'].loc[i]])
        maj.append(processed_data['Major'].loc[res['Index'].loc[i]])
        Q_Score.append(processed_data['Q_Score'].loc[res['Index'].loc[i]])
        V_Score.append(processed_data['V_Score'].loc[res['Index'].loc[i]])
        AWA_Score.append(processed_data['AWA_Score'].loc[res['Index'].loc[i]])
        GPA_Score.append(processed_data['GPA_Score'].loc[res['Index'].loc[i]])
        Major_grp.append(processed_data['Major_grp'].loc[res['Index'].loc[i]])
        Program.append(processed_data['Program'].loc[res['Index'].loc[i]])
    res['Major'] = maj
    res['University'] = uni
    res['Q_Score'] = Q_Score
    res['V_Score'] = V_Score
    res['AWA_Score'] = AWA_Score
    res['GPA_Score'] = GPA_Score
    res['Major_grp']=Major_grp
    res['Program']=Program
    result1 = res.loc[res['Major_grp'].isin(data['Major'])]
    result = result1.loc[res['Program'].isin(data['Program'])]
    return result

In [96]:
result=recomendor(processed_test_data)
result

Unnamed: 0,Index,Major,University,Q_Score,V_Score,AWA_Score,GPA_Score,Major_grp,Program
119,10923,Artificial Intelligence,San Jose State University,154,148,4.0,3.5,Computer Science,Masters
120,11708,Computer Science,Santa Clara University,154,148,4.0,3.5,Computer Science,Masters
121,11380,Computer Science,California State University Easy Bay,154,148,4.0,3.5,Computer Science,Masters
122,11594,Computer Science,San Diego State University,154,148,4.0,3.5,Computer Science,Masters
201,18921,Data Science,George Washington University,155,152,4.0,3.5,Computer Science,Masters
406,16766,Health Data Science,Dartmouth College,155,149,3.5,3.6,Computer Science,Masters
534,17290,Electrical And Computer Engineering,UC San Diego (UCSD),155,151,4.0,3.7,Computer Science,Masters
559,10450,Computer Science,TU Delft Netherlands,154,149,3.5,3.7,Computer Science,Masters
562,10283,Computer Science,Uppsala University,154,149,3.5,3.7,Computer Science,Masters
661,11685,Computer Science,George Mason University,157,150,3.5,3.4,Computer Science,Masters


In [97]:
#result.to_csv('result.csv')

In [98]:
University=result['University'].tolist()
final=[]
for i in range(len(University)):
    if University[i] not in final:
        final.append(University[i])  
    if len(final)==15:
        break
final=[re.sub(r"[^a-zA-Z0-9()]"," ", _) for _ in final]
final

[' San Jose State University',
 ' Santa Clara University',
 ' California State University Easy Bay',
 ' San Diego State University',
 ' George Washington University',
 ' Dartmouth College',
 ' UC San Diego (UCSD)',
 ' TU Delft Netherlands',
 ' Uppsala University',
 ' George Mason University',
 ' Rochester Institute Of Technology (RIT)',
 ' Purdue University   West Lafayette',
 ' Georgia Institute Of Technology',
 ' Washington State University (pullman)',
 ' University Of Waterloo']