# Preparación de los datos

En esta sección realizaremos la lectura del Excel, así como su limpieza y la creación del dataframe con el que trabajaremos.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns


Definiremos una función para la **lectura**

In [2]:
def lectura(path, limit = 250):
    '''
    Read the yearly sheets of the Excel workbook into a single dataframe.

    Parameters

    path Path to the Excel file; one sheet named after each year from
         '2011' to '2021' is requested
    limit Number of leading rows (universities) kept from every sheet

    Returns

    A dataframe with the first `limit` rows of every yearly sheet stacked
    in chronological order (each sheet keeps its own row index), or None
    if the file does not exist, in which case a message is printed

    '''
    sheet_names = [str(year) for year in range(2011, 2022)]
    try:
        # Asking for a list of sheets makes pandas open and parse the
        # workbook once and return a {sheet name: dataframe} dict.
        sheets = pd.read_excel(path, sheet_name=sheet_names)
    except FileNotFoundError:
        print('Archivo no encontrado')
        return None
    # Concatenate a single time instead of growing a dataframe inside a loop.
    return pd.concat([sheets[name].head(limit) for name in sheet_names])
        
    

Y una para **limpieza**

In [3]:
def leer(path):
    '''
    Read the Excel workbook through `lectura` and clean the result.

    Parameters

    path Path to the Excel file

    Returns

    A cleaned dataframe, or None (after printing a message) when the
    workbook could not be read or a TypeError is raised while cleaning

    '''
    df = lectura(path)
    if df is None:
        # Explicit guard instead of relying on the TypeError raised by
        # subscripting None; the message callers used to see is kept.
        print('Error en la lectura')
        return None

    # Columns that may hold non-numeric text; unparseable cells become NaN.
    # NOTE(review): a '601+' entry in any of these columns is coerced to NaN
    # and then forward-filled / averaged instead of being mapped to 601 by
    # the final replace. Confirm this is intended.
    numeric_columns = ['O_Rank', 'AR Rank', 'ER Rank', 'Employer Reputation',
                       'Faculty Student', 'International Faculty']
    # Rank columns: a gap inherits the closest previous value.
    rank_columns = ['Rank', 'O_Rank', 'AR Rank', 'ER Rank',
                    'FS Rank', 'CF Rank', 'IF Rank', 'IS Rank']
    # Reputation and overall scores: gaps take the column mean.
    mean_columns = ['Academic Reputation', 'Employer Reputation',
                    'Overall Score']
    # Faculty / citation / international indicators: gaps take the median.
    median_columns = ['Faculty Student', 'Citations per Faculty',
                      'International Students', 'International Faculty']

    try:
        for column in numeric_columns:
            df[column] = pd.to_numeric(df[column], errors='coerce')
        for column in rank_columns:
            # .ffill() replaces the deprecated fillna(method='ffill')
            df[column] = df[column].ffill()
        for column in mean_columns:
            # The mean must be *called*: passing the bound method itself
            # would put a method object into the column instead of a number.
            df[column] = df[column].fillna(df[column].mean())
        for column in median_columns:
            df[column] = df[column].fillna(df[column].median())
        # Remaining open-ended rank labels become the number 601
        df = df.replace('601+', int(601))
    except TypeError:
        print('Error en la lectura')
        return None
    return df
    

Finalmente guardamos el Dataframe

In [4]:
# Read and clean the QS workbook; leer returns None if the file is missing
# or a TypeError occurs during cleaning.
df = leer('../data/raw/QS WUR 2011-2022.xlsx')

In [5]:
# Sanity check: number of missing values left in each column after cleaning
df.isna().sum()

Year                      0
Rank                      0
O_Rank                    0
Institution               0
Academic Reputation       0
AR Rank                   0
Employer Reputation       0
ER Rank                   0
Faculty Student           0
FS Rank                   0
Citations per Faculty     0
CF Rank                   0
International Faculty     0
IF Rank                   0
International Students    0
IS Rank                   0
Overall Score             0
dtype: int64

# Modelación

En esta etapa analizaremos los datos, principalmente mediante tres análisis:

* Regresión lineal múltiple, para estimar la importancia de cada categoría en el *Overall Score*
* Regresión lineal simple, con el Tec, para predecir su posición en los rankings el próximo año
* Importancia de cada **Rank**

## Regresión lineal múltiple

In [6]:
# List the column labels; note that 'Institution ' carries a trailing space
df.columns

Index(['Year', 'Rank', 'O_Rank', 'Institution ', 'Academic Reputation',
       'AR Rank', 'Employer Reputation', 'ER Rank', 'Faculty Student',
       'FS Rank', 'Citations per Faculty', 'CF Rank', 'International Faculty',
       'IF Rank', 'International Students', 'IS Rank', 'Overall Score'],
      dtype='object')

In [7]:
# Show the cleaned dataframe (pandas elides the middle rows in the output)
df

Unnamed: 0,Year,Rank,O_Rank,Institution,Academic Reputation,AR Rank,Employer Reputation,ER Rank,Faculty Student,FS Rank,Citations per Faculty,CF Rank,International Faculty,IF Rank,International Students,IS Rank,Overall Score
0,2011.0,1.0,1.0,University of Cambridge,100.0,1.0,100.0,3.0,100.0,18.0,93.0,36.0,96.0,38.0,95.0,39.0,100.0
1,2011.0,2.0,2.0,Harvard University,100.0,2.0,100.0,1.0,97.0,40.0,100.0,3.0,71.0,125.0,87.0,70.0,99.2
2,2011.0,3.0,3.0,Yale University,100.0,10.0,100.0,10.0,100.0,7.0,98.0,22.0,78.0,109.0,66.0,149.0,98.7
3,2011.0,4.0,4.0,UCL (University College London),99.0,23.0,93.0,34.0,99.0,24.0,91.0,41.0,94.0,45.0,99.0,18.0,98.5
4,2011.0,5.0,5.0,Massachusetts Institute of Technology (MIT),100.0,6.0,100.0,4.0,100.0,21.0,100.0,9.0,31.0,265.0,98.0,24.0,98.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,2021.0,246.0,245.0,Queen's University,36.5,229.0,68.8,99.0,7.5,601.0,48.2,235.0,71.0,255.0,28.8,438.0,37.8
246,2021.0,246.0,245.0,University of Calgary,33.0,257.0,36.6,262.0,27.2,539.0,41.2,311.0,82.9,212.0,57.1,252.0,37.8
247,2021.0,246.0,245.0,University of Sussex,28.0,301.0,10.2,262.0,20.2,601.0,58.7,141.0,96.4,126.0,95.1,83.0,37.8
248,2021.0,246.0,245.0,Wuhan University,38.2,219.0,53.7,150.0,16.8,601.0,53.4,177.0,48.6,356.0,8.5,601.0,37.8


In [8]:
# Predictor columns for the multiple regression: every rank and score
# column other than the target 'Overall Score'.
columnas = [
    'Rank',
    'O_Rank',
    'Academic Reputation',
    'AR Rank',
    'Employer Reputation',
    'ER Rank',
    'Faculty Student',
    'FS Rank',
    'Citations per Faculty',
    'CF Rank',
    'International Faculty',
    'IF Rank',
    'International Students',
    'IS Rank',
]

In [9]:
# Target vector and feature matrix as plain NumPy arrays
Y = df['Overall Score'].to_numpy()

X = df[columnas].to_numpy()


Para poder evaluar el modelo y detectar _overfitting_, separamos los datos en 80% para entrenamiento y 20% para test

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)


Ajustamos el modelo de regresión lineal con los datos de entrenamiento

In [11]:
# Linear regression estimator created with scikit-learn's default settings
modelo = LinearRegression()


In [12]:
# Fit on the training split only; X_test / Y_test are not used here
modelo.fit(X_train, Y_train)


LinearRegression()