In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
import plotly.graph_objects as go
import matplotlib as mpl
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.express as px
from plotly import tools
from plotly.subplots import make_subplots
from plotly.offline import iplot

from scipy import stats
from scipy.stats import shapiro
from scipy.stats import chi2_contingency
from scipy.stats import chi2

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import metrics

[Датасет](https://www.kaggle.com/datasets/mukeshmanral/graduates-admission-prediction)

In [4]:
df = pd.read_csv("Admission_Predict_Ver1.1.csv")

*   GRE Score: The Graduate Record Examinations is a standardized test that is an admissions requirement for many graduate schools in the United States and Canada.
*   TOEFL Score: Score in TOEFL exam.
*   University Rating: Student undergraduate university ranking.
*   SOP: Statement of Purpose strength.
*   LOR: Letter of Recommendation strength.
*   CGPA: Undergraduate GPA.
*   Research: Whether student has research experience or not.
*   Chance of Admit: Admission chance.

In [5]:
#Убираем пробелы в названиях фичей
del df['Serial No.']
df.columns = df.columns.str.replace(' ', '')
df.columns

Index(['GREScore', 'TOEFLScore', 'UniversityRating', 'SOP', 'LOR', 'CGPA',
       'Research', 'ChanceofAdmit'],
      dtype='object')

In [6]:
# Посмотрим на пропущенные значения
total = df.isnull().sum().sort_values(ascending = False)
perc = (df.isnull().sum()/df.isnull().count()*100).sort_values(ascending = False)
pd.concat([total, perc], axis=1, keys=['Всего', 'В процентах'])

Unnamed: 0,Всего,В процентах
GREScore,0,0.0
TOEFLScore,0,0.0
UniversityRating,0,0.0
SOP,0,0.0
LOR,0,0.0
CGPA,0,0.0
Research,0,0.0
ChanceofAdmit,0,0.0


Их нет

In [7]:
# Посмотрим дубликаты
df[df.duplicated()]

Unnamed: 0,GREScore,TOEFLScore,UniversityRating,SOP,LOR,CGPA,Research,ChanceofAdmit


Их тоже нет

## Анализ фичей по одной

In [8]:
def percent_value_counts(df, feature):
    percent = pd.DataFrame(round(df.loc[:,feature].value_counts(dropna=False, normalize=True)*100,2))
    total = pd.DataFrame(df.loc[:,feature].value_counts(dropna=False))

    total.columns = ["total"]
    percent.columns = ['percent']
    return pd.concat([total, percent], axis = 1)

In [9]:
fig = make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(y = df['GREScore'].value_counts().values.tolist(),
                      x = df['GREScore'].value_counts().index,
                      text=df['GREScore'].value_counts().values.tolist(),
              textfont=dict(size=15),
                      textposition = 'outside',
                      showlegend=False,
              marker = dict(color = '#19e6e6',
                            line_color = 'black',
                            line_width=3)),row = 1,col = 1)

fig.update_layout(title='Гистограмма GRE Score',
                  template='plotly_white')
fig.update_yaxes(range=[0,25])
fig.show()

In [10]:
fig = make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(y = df['TOEFLScore'].value_counts().values.tolist(),
                      x = df['TOEFLScore'].value_counts().index,
                      text=df['TOEFLScore'].value_counts().values.tolist(),
              textfont=dict(size=15),
                      textposition = 'outside',
                      showlegend=False,
              marker = dict(color = '#19e6e6',
                            line_color = 'black',
                            line_width=3)),row = 1,col = 1)

fig.update_layout(title='Гистограмма TOEFL Score',
                  template='plotly_white')
fig.update_yaxes(range=[0,48])
iplot(fig)

In [11]:
fig = make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(y = df['UniversityRating'].value_counts().values.tolist(),
                      x = df['UniversityRating'].value_counts().index,
                      text=df['UniversityRating'].value_counts().values.tolist(),
              textfont=dict(size=15),
                      textposition = 'outside',
                      showlegend=False,
              marker = dict(color = '#19e6e6',
                            line_color = 'black',
                            line_width=3)),row = 1,col = 1)

fig.update_layout(title='Гистограмма University Rating',
                  template='plotly_white')
fig.update_yaxes(range=[0,200])
iplot(fig)

In [12]:
fig = make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(y = df['SOP'].value_counts().values.tolist(),
                      x = df['SOP'].value_counts().index,
                      text=df['SOP'].value_counts().values.tolist(),
              textfont=dict(size=15),
                      textposition = 'outside',
                      showlegend=False,
              marker = dict(color = '#19e6e6',
                            line_color = 'black',
                            line_width=3)),row = 1,col = 1)

fig.update_layout(title='Density distribution of SOP',
                  template='plotly_white')
fig.update_yaxes(range=[0,100])
iplot(fig)

In [13]:
fig = make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(y = df['LOR'].value_counts().values.tolist(),
                      x = df['LOR'].value_counts().index,
                      text=df['LOR'].value_counts().values.tolist(),
              textfont=dict(size=15),
                      textposition = 'outside',
                      showlegend=False,
              marker = dict(color = '#19e6e6',
                            line_color = 'black',
                            line_width=3)),row = 1,col = 1)

fig.update_layout(title='Гистограмма LOR',
                  template='plotly_white')
fig.update_yaxes(range=[0,100])
iplot(fig)

In [14]:
fig = make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(y = df['Research'].value_counts().values.tolist(),
                      x = df['Research'].value_counts().index,
                      text=df['Research'].value_counts().values.tolist(),
              textfont=dict(size=15),
                      textposition = 'outside',
                      showlegend=False,
              marker = dict(color = '#19e6e6',
                            line_color = 'black',
                            line_width=3)),row = 1,col = 1)

fig.update_layout(title='Гистограмма "Research"',
                  template='plotly_white')
fig.update_yaxes(range=[0,300])
iplot(fig)

In [15]:
fig = make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(y = df['ChanceofAdmit'].value_counts().values.tolist(),
                      x = df['ChanceofAdmit'].value_counts().index,
                      text=df['ChanceofAdmit'].value_counts().values.tolist(),
              textfont=dict(size=15),
                      textposition = 'outside',
                      showlegend=False,
              marker = dict(color = '#19e6e6',
                            line_color = 'black',
                            line_width=3)),row = 1,col = 1)

fig.update_layout(title='Гистограмма "Chance of Admit"',
                  template='plotly_white')
fig.update_yaxes(range=[0,25])
iplot(fig)

## Анализ фичей относительно друг друга

In [16]:
df_corr = df.corr()

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = df_corr.columns,
        y = df_corr.index,
        z = np.array(df_corr),
        text=np.array(df_corr),
        texttemplate="%{text}",
        textfont={"size":20}
    )
)

### 1. GRE и Chance of Admit

In [17]:
fig = px.box(df, x="GREScore", y="ChanceofAdmit")
fig.update_layout(title='GREScore Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()

fig = px.scatter(df, x="GREScore", y="ChanceofAdmit",trendline="ols",trendline_color_override = '#000000',width=1000, height=400)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='Ящики с усами GREScore и ChanceofAdmit',
                  template='plotly_white')
fig.show()

Видна сильная положительная связь

### 2. TOEFL и Chance of Admit


In [18]:
fig = px.box(df, x="TOEFLScore", y="ChanceofAdmit")
fig.update_layout(title='TOEFLScore Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()

fig = px.scatter(df, x="TOEFLScore", y="ChanceofAdmit", trendline="ols",trendline_color_override = '#000000',width=1000, height=400)
fig.update_layout(title='TOEFLScore Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()

Также видна сильная положительная связь

### 3. University Rating и Chance of Admit


In [19]:
fig = px.box(df, x="UniversityRating", y="ChanceofAdmit")
fig.update_layout(title='UniversityRating Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()

fig = px.scatter(df, x="UniversityRating", y="ChanceofAdmit", trendline="ols",trendline_color_override = '#000000',width=1000, height=400)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='UniversityRating Vs ChanceofAdmit',
                  template='plotly_white')
fig.show()

На данном графике связь установить не удается

### 4. SOP и Chance of Admit

In [20]:
fig = px.box(df, x="SOP", y="ChanceofAdmit")
fig.update_layout(title='SOP Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()
fig = px.scatter(df, x="SOP", y="ChanceofAdmit", trendline="ols",trendline_color_override = '#000000',width=1000, height=400)
fig.update_layout(title='SOP Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()

На данном графике связь установить не удается


### 4. LOR и Chance of Admit


In [21]:
fig = px.box(df, x="LOR", y="ChanceofAdmit")
fig.update_layout(title='CGPA Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()
fig = px.scatter(df, x="LOR", y="ChanceofAdmit", trendline="ols",trendline_color_override = '#000000',width=1000, height=400)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='LOR Vs ChanceofAdmit',
                  template='plotly_white')
fig.show()

На данном графике связь установить не удается

### 5. CGPA и Chance of Admit


In [22]:
fig = px.box(df, x="CGPA", y="ChanceofAdmit")
fig.update_layout(title='CGPA Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()
fig = px.scatter(df, x="CGPA", y="ChanceofAdmit", trendline="ols",trendline_color_override = '#000000',width=1000, height=400)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='CGPA Vs ChanceofAdmit',
                  template='plotly_white')
fig.show()

Сильная положительная связь

### 6. Research и Chance of Admit


In [23]:
fig = px.box(df, x="Research", y="ChanceofAdmit")
fig.update_layout(title='Research Vs ChanceofAdmit',
                  template='plotly_white')
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.show()
fig = px.scatter(df, x="Research", y="ChanceofAdmit", trendline="ols",trendline_color_override = '#000000',width=1000, height=400)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='Research Vs ChanceofAdmit',
                  template='plotly_white')
fig.show()

Ничего(

## Гипотезы

### Проведем категоризацию признака ChanceofAdmit

In [24]:
df['Admission_categories']=0
df.loc[(df['ChanceofAdmit']>=0)&(df['ChanceofAdmit']<=0.35),'Admission_categories']='Low'
df.loc[(df['ChanceofAdmit']>0.35)&(df['ChanceofAdmit']<=0.7),'Admission_categories']='Medium'
df.loc[df['ChanceofAdmit']>0.7,'Admission_categories']='High'

In [25]:
fig = make_subplots(rows=1,cols=1)
fig.add_trace(go.Bar(y = df['Admission_categories'].value_counts().values.tolist(),
                      x = df['Admission_categories'].value_counts().index,
                      text=df['Admission_categories'].value_counts().values.tolist(),
              textfont=dict(size=15),
                      textposition = 'outside',
                      showlegend=False,
              marker = dict(color = '#19e6e6',
                            line_color = 'black',
                            line_width=3)),row = 1,col = 1)

fig.update_layout(title='Density distribution of Admission_categories',
                  template='plotly_white')
fig.update_yaxes(range=[0,350])
iplot(fig)

## 1. Проверка статзначимой связи между GRE и вероятностью быть принятым(Хи-квадрат)

In [26]:
fig = px.histogram(df, x="GREScore", color ="Admission_categories",barmode='group',color_discrete_map = {"High":'#0000ff',"Medium":'#0099ff',"Low": "#4dffff"})

fig.update_layout(bargap = 0.5,title='GREScore Vs Admission_categories',
                  template='plotly_white')
fig.show()

In [27]:
Admission_Gre = pd.crosstab(df["Admission_categories"],df["GREScore"])

GREScore              290  293  294  295  296  297  298  299  300  301  302  \
Admission_categories                                                          
High                    0    0    0    0    0    0    0    0    1    0    0   
Low                     0    0    0    0    0    1    1    0    0    0    0   
Medium                  2    1    2    5    5    5    9   10   11   11    7   

GREScore              303  304  305  306  307  308  309  310  311  312  313  \
Admission_categories                                                          
High                    1    1    2    2    3    3    3    5    6   11    7   
Low                     0    0    0    0    0    0    0    0    0    0    0   
Medium                  4   11    9    5    7   10    6    6   10   13    5   

GREScore              314  315  316  317  318  319  320  321  322  323  324  \
Admission_categories                                                          
High                    8    4    8    7    7    9

In [28]:
stat, p, dof, expected = chi2_contingency(Admission_Gre)
print('Степени свободы = %d' % dof)

Степени свободы = 96


In [29]:
prob = 0.95
critical = chi2.ppf(0.95, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

probability=0.950, critical=119.871, stat=318.492


In [30]:
if abs(stat) >= critical:
    print('Зависимы')
else:
    print('Независимы')

Зависимы


In [31]:
alpha = 1.0 - prob
print('Значимость = %.3f, p = %.3f' % (alpha, p))
if p <= alpha:
    print('Зависимы')
else:
    print('Независимы')

Значимость = 0.050, p = 0.000
Зависимы


## 2. Проверка статзначимой связи между TOEFL и вероятностью быть принятым(Хи-квадрат)

In [32]:
fig = px.histogram(df, x="TOEFLScore", color ="Admission_categories",barmode='group',color_discrete_map = {"High":'#0000ff',"Medium":'#0099ff',"Low": "#4dffff"})

fig.update_layout(bargap = 0.5,title='TOEFLScore Vs Admission_categories',
                  template='plotly_white')
fig.show()

In [33]:
Admission_Toefl = pd.crosstab(df["Admission_categories"],df["TOEFLScore"])

In [34]:
stat, p, dof, expected = chi2_contingency(Admission_Toefl)
print('Степени свободы = %d' % dof)

Степени свободы = 56


In [35]:
prob = 0.95
critical = chi2.ppf(0.95, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

probability=0.950, critical=74.468, stat=279.615


In [36]:
if abs(stat) >= critical:
    print('Зависимы')
else:
    print('Независимы')

Зависимы


In [37]:
alpha = 1.0 - prob
print('Уровень значимости = %.3f, p = %.3f' % (alpha, p))
if p <= alpha:
    print('Зависимы')
else:
    print('Независимы')

Уровень значимости = 0.050, p = 0.000
Зависимы


## 3. Проверка статзначимой связи между University Rating и вероятностью быть принятым(Хи-квадрат)

In [38]:
fig = px.histogram(df, x="UniversityRating", color ="Admission_categories",barmode='group',color_discrete_map = {"High":'#0000ff',"Medium":'#0099ff',"Low": "#4dffff"})

fig.update_layout(bargap = 0.5,title='UniversityRating Vs Admission_categories',
                  template='plotly_white')
fig.show()

In [39]:
Admission_rating = pd.crosstab(df["Admission_categories"],df["UniversityRating"])

In [40]:
stat, p, dof, expected = chi2_contingency(Admission_rating)
print('Степени свободы = %d' % dof)
print('Expected frequencies ', expected)

Степени свободы = 8
Expected frequencies  [[19.516 72.324 92.988 60.27  41.902]
 [ 0.136  0.504  0.648  0.42   0.292]
 [14.348 53.172 68.364 44.31  30.806]]


In [41]:
prob = 0.95
critical = chi2.ppf(0.95, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

probability=0.950, critical=15.507, stat=172.161


In [42]:
if abs(stat) >= critical:
    print('Зависимы')
else:
    print('Независимы')

Зависимы


In [43]:
alpha = 1.0 - prob
print('Уровень значимости = %.3f, p = %.3f' % (alpha, p))
if p <= alpha:
    print('Зависимы')
else:
    print('Независимы')

Уровень значимости = 0.050, p = 0.000
Зависимы


In [44]:
fig = px.histogram(df, x="SOP", color ="Admission_categories",barmode='group',color_discrete_map = {"High":'#0000ff',"Medium":'#0099ff',"Low": "#4dffff"})

fig.update_layout(bargap = 0.5,title='SOP Vs Admission_categories',
                  template='plotly_white')
fig.show()

## 4. Проверка статзначимой связи между SOP и вероятностью быть принятым(Хи-квадрат)

In [45]:
Admission_SOP = pd.crosstab(df["Admission_categories"],df["SOP"])

In [46]:
stat, p, dof, expected = chi2_contingency(Admission_SOP)
print('Степени свободы = %d' % dof)

Степени свободы = 16


In [47]:
prob = 0.95
critical = chi2.ppf(0.95, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

probability=0.950, critical=26.296, stat=169.418


In [48]:
if abs(stat) >= critical:
    print('Зависимы')
else:
    print('Независимы')

Зависимы


In [49]:
alpha = 1.0 - prob
print('Уровень значимости = %.3f, p = %.3f' % (alpha, p))
if p <= alpha:
    print('Зависимы')
else:
    print('Независимы')

Уровень значимости = 0.050, p = 0.000
Зависимы


## 5. Проверка статзначимой связи между SOP и вероятностью быть принятым(Хи-квадрат)


In [50]:
fig = px.histogram(df, x="LOR", color ="Admission_categories",barmode='group',color_discrete_map = {"High":'#0000ff',"Medium":'#0099ff',"Low": "#4dffff"})

fig.update_layout(bargap = 0.5,title='LOR Vs Admission_categories',
                  template='plotly_white')
fig.show()

In [51]:
Admission_LOR = pd.crosstab(df["Admission_categories"],df["LOR"])

In [52]:
stat, p, dof, expected = chi2_contingency(Admission_LOR)
print('Степени свободы = %d' % dof)

Степени свободы = 16


In [53]:
prob = 0.95
critical = chi2.ppf(0.95, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

probability=0.950, critical=26.296, stat=154.788


In [54]:
if abs(stat) >= critical:
    print('Зависимы')
else:
    print('Независимы')

Зависимы


In [55]:
alpha = 1.0 - prob
print('Уровень значимости = %.3f, p = %.3f' % (alpha, p))
if p <= alpha:
    print('Зависимы')
else:
    print('Независимы')

Уровень значимости = 0.050, p = 0.000
Зависимы


## 6. Проверка статзначимой связи между CGPA и вероятностью быть принятым(Хи-квадрат)


In [56]:
fig = px.histogram(df, x="CGPA", color ="Admission_categories",barmode='group',color_discrete_map = {"High":'#0000ff',"Medium":'#0099ff',"Low": "#4dffff"})

fig.update_layout(bargap = 0.5,title='CGPA Vs Admission_categories',
                  template='plotly_white')
fig.show()

In [57]:
Admission_CGPA = pd.crosstab(df["Admission_categories"],df["CGPA"])

In [58]:
stat, p, dof, expected = chi2_contingency(Admission_CGPA)
print('Степени свободы = %d' % dof)

Степени свободы = 366


In [59]:
prob = 0.95
critical = chi2.ppf(0.95, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

probability=0.950, critical=411.610, stat=517.268


In [60]:
if abs(stat) >= critical:
    print('Зависимы')
else:
    print('Независимы')

Зависимы


In [61]:
alpha = 1.0 - prob
print('Уровень значимости = %.3f, p = %.3f' % (alpha, p))
if p <= alpha:
    print('Зависимы')
else:
    print('Независимы')

Уровень значимости = 0.050, p = 0.000
Зависимы


## 7. Проверка статзначимой связи между SOP и вероятностью быть принятым(Хи-квадрат)

In [62]:
fig = px.histogram(df, x="Research", color ="Admission_categories",barmode='group',color_discrete_map = {"High":'#0000ff',"Medium":'#0099ff',"Low": "#4dffff"})

fig.update_layout(bargap = 0.5,title='Research Vs Admission_categories',
                  template='plotly_white')
fig.show()

In [63]:
Admission_R = pd.crosstab(df["Admission_categories"],df["Research"])

In [64]:
stat, p, dof, expected = chi2_contingency(Admission_R)
print('Степени свободы = %d' % dof)

Степени свободы = 2


In [65]:
prob = 0.95
critical = chi2.ppf(0.95, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

probability=0.950, critical=5.991, stat=121.247


In [66]:
if abs(stat) >= critical:
    print('Зависимы')
else:
    print('Независимы')

Зависимы


In [67]:
alpha = 1.0 - prob
print('Уровень значимости = %.3f, p = %.3f' % (alpha, p))
if p <= alpha:
    print('Зависимы')
else:
    print('Независимы')

Уровень значимости = 0.050, p = 0.000
Зависимы


# Мультиколлинеарность

In [68]:
fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])

fig = px.scatter(df, x="TOEFLScore", y="GREScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='TOEFL Vs GREScore',
                  template='plotly_white')
fig.show()



fig = px.scatter(df, x="UniversityRating",y="GREScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='UniversityRating Vs GREScore',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="SOP", y="GREScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='CGPA Vs GREScore',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="LOR", y="GREScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='LOR Vs ChanceofAdmit',
                  template='plotly_white')
fig.show()



fig = px.scatter(df, x="CGPA",y="GREScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='UniversityRating Vs GREScore',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="Research", y="GREScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='LOR Vs GREScore',
                  template='plotly_white')
fig.show()

In [69]:
fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])

fig = px.scatter(df, x="GREScore", y="TOEFLScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='GREScore Vs TOEFLScore',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="UniversityRating",y="TOEFLScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='UniversityRating Vs TOEFLScore',
                  template='plotly_white')
fig.show()



fig = px.scatter(df, x="SOP", y="TOEFLScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='SOP Vs TOEFLScore',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="LOR", y="TOEFLScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='LOR Vs TOEFLScore',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="CGPA",y="TOEFLScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='CGPA Vs TOEFLScore',
                  template='plotly_white')
fig.show()



fig = px.scatter(df, x="Research", y="TOEFLScore", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='Research Vs TOEFLScore',
                  template='plotly_white')
fig.show()

In [70]:
fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])


fig = px.scatter(df, x="GREScore", y="UniversityRating", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='GREScore Vs UniversityRating',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="TOEFLScore",y="UniversityRating", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='TOEFL Vs UniversityRating',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="SOP", y="UniversityRating", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='SOP Vs UniversityRating',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="LOR", y="UniversityRating", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='LOR Vs UniversityRating',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="CGPA",y="UniversityRating", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='CGPA Vs UniversityRating',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="Research", y="UniversityRating", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='Research Vs UniversityRating',
                  template='plotly_white')
fig.show()

In [71]:
fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])
fig = px.scatter(df, x="GREScore", y="SOP", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='GREScore Vs SOP',
                  template='plotly_white')
fig.show()
fig = px.scatter(df, x="TOEFLScore",y="SOP", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='TOEFL Vs SOP',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="UniversityRating", y="SOP", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='UniversityRating Vs SOP',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="LOR", y="SOP", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='LOR Vs SOP',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="CGPA",y="SOP", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='CGPA Vs SOP',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="Research", y="SOP", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='Research Vs SOP',
                  template='plotly_white')
fig.show()

In [72]:
fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])

fig = px.scatter(df, x="GREScore", y="LOR", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='GREScore Vs LOR',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="TOEFLScore",y="LOR", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='TOEFL Vs LOR',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="UniversityRating", y="LOR", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='UniversityRating Vs LOR',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="SOP", y="LOR", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='SOP Vs LOR',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="CGPA",y="LOR", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='CGPA Vs LOR',
                  template='plotly_white')
fig.show()


fig = px.scatter(df, x="Research", y="LOR", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='Research Vs LOR',
                  template='plotly_white')
fig.show()

In [73]:
fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])
fig = px.scatter(df, x="GREScore", y="CGPA", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='GREScore Vs CGPA',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="TOEFLScore",y="CGPA", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='TOEFL Vs CGPA',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="UniversityRating", y="CGPA", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='UniversityRating Vs CGPA',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="LOR", y="CGPA", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='LOR Vs CGPA',
                  template='plotly_white')
fig.show()

fig = px.scatter(df, x="SOP",y="CGPA", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='SOP Vs CGPA',
                  template='plotly_white')
fig.show()
fig = px.scatter(df, x="Research", y="CGPA", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='Research Vs CGPA',
                  template='plotly_white')
fig.show()

In [74]:
fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])
plt.figure(figsize = (10,8))

fig = px.scatter(df, x="GREScore", y="Research", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='GREScore Vs Research',
                  template='plotly_white')
fig.show()
fig = px.scatter(df, x="TOEFLScore",y="Research", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='TOEFL Vs Research',
                  template='plotly_white')
fig.show()
fig = px.scatter(df, x="UniversityRating", y="Research", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='UniversityRating Vs Research',
                  template='plotly_white')
fig.show()
fig = px.scatter(df, x="LOR", y="Research", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='LOR Vs Research',
                  template='plotly_white')
fig.show()
fig = px.scatter(df, x="CGPA",y="Research", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='CGPA Vs Research',
                  template='plotly_white')
fig.show()
fig = px.scatter(df, x="SOP", y="Research", trendline="ols",trendline_color_override = '#000000',width=800, height=300)
fig.update_traces(marker_size=12,marker_color='#19e6e6')
fig.update_layout(title='SOP Vs Research',
                  template='plotly_white')
fig.show()

<Figure size 1000x800 with 0 Axes>

# Разделение выборки

In [75]:
df = pd.read_csv("Admission_Predict_Ver1.1.csv")
del df['Serial No.']
df.columns = df.columns.str.replace(' ', '')
X= df.drop(["ChanceofAdmit"],axis =1)
y= df["ChanceofAdmit"]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25, random_state=7)

# Модели

In [91]:
pipeline_lr=Pipeline([("scalar1",StandardScaler()),
                      ("LR",LinearRegression())])

pipeline_dt=Pipeline([("scalar2",StandardScaler()),
                      ("DT",DecisionTreeRegressor())])

pipeline_rf=Pipeline([("scalar3",StandardScaler()),
                      ("RF",RandomForestRegressor())])

pipeline_knn=Pipeline([("scalar4",StandardScaler()),
                       ("KN",KNeighborsRegressor())])

pipeline_xgb=Pipeline([("scalar5",StandardScaler()),
                       ("XGB",XGBRegressor())])


pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_knn, pipeline_xgb]
pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest",3: "KNeighbors", 4: "XGBRegressor"}


for pipe in pipelines:
    pipe.fit(X_train, y_train)

cv_results_rms = []
for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, X_train,y_train,scoring="max_error", cv=10)
    cv_results_rms.append(cv_score)
    print("%s: %f " % (pipe_dict[i], cv_score.mean()))

LinearRegression: -0.162411 
DecisionTree: -0.266000 
RandomForest: -0.176620 
KNeighbors: -0.185600 
XGBRegressor: -0.190674 


# Метрики

In [77]:
def vanilla_linreg_fit_instance(X, y):
    some_lr =Pipeline([("scalar1",StandardScaler()),
                      ("LR",LinearRegression())])
    some_lr.fit(X, y)
    return some_lr

def get_metrics(fitted_instance, X_test, y_test):
    pred = fitted_instance.predict(X_test)
    r2 = metrics.r2_score(y_test, pred)
    Adjusted_r2 = 1 - (1-metrics.r2_score(y_test, pred))*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
    mae = metrics.mean_absolute_error(y_test, pred)
    mse = metrics.mean_squared_error(y_test, pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
    ResultData = [[r2],[Adjusted_r2],[mae],[mse],[rmse]]
    Results = pd.DataFrame(ResultData,columns= ["Scores"] ,index = ["R-Squared","Adjusted R-Squared", "Mean Absolute Error","Mean Square Error","Root Mean Square Error"])
    return Results
linreg = vanilla_linreg_fit_instance(X_train, y_train)
get_metrics(linreg, X_test, y_test)

Unnamed: 0,Scores
R-Squared,0.796192
Adjusted R-Squared,0.783998
Mean Absolute Error,0.045297
Mean Square Error,0.003989
Root Mean Square Error,0.063159


Снова посмотрим на коррелограмму и уберем сильно скоррелированные признаки

Например, уберем TOEFL score и UniversityRating

In [89]:
X2_train, X2_test =  X_train.drop(["TOEFLScore", "UniversityRating", "Research", "SOP"],axis =1), X_test.drop(["TOEFLScore", "UniversityRating", "Research", "SOP"],axis =1)
linreg = vanilla_linreg_fit_instance(X2_train, y_train)
get_metrics(linreg, X2_test, y_test)

Unnamed: 0,Scores
R-Squared,0.808997
Adjusted R-Squared,0.804261
Mean Absolute Error,0.043492
Mean Square Error,0.003738
Root Mean Square Error,0.061142


In [90]:
df_corr = X2_test.corr()

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = df_corr.columns,
        y = df_corr.index,
        z = np.array(df_corr),
        text=np.array(df_corr),
        texttemplate="%{text}",
        textfont={"size":20}
    )
)

In [92]:
some_new_student = [(332,110,3,4.0,4.5,8.6,1)]
new_array = np.asarray(some_new_student)
labels=["Принят","Отклонен"]
prediction=pipeline_lr.predict(new_array)
no_of_test_cases, cols = new_array.shape
for i in range(no_of_test_cases):
 print("GRE {}, GPA {}, Rank {} ----- {}".format(some_new_student[i][0],some_new_student[i][1],some_new_student[i][2], labels[int(prediction[i])]))

GRE 332, GPA 110, Rank 3 ----- Принят
