<a href="https://colab.research.google.com/github/josedvelez1/Kaggle/blob/main/99_modelo_soluci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modelo Solución con Árbol de Decisiones

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
!pip install Unidecode
import unidecode
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report



## Lectura del archivo

In [None]:
# Read both CSV files from the working directory; the trailing expression
# makes the notebook display the dimensions of the test frame.
students_df, test_df = (pd.read_csv(name) for name in ("train.csv", "test.csv"))
test_df.shape

(296786, 12)

## Limpieza de los datos

In [None]:
# These helper functions were also taken from the Colab notebooks shared by the professor.
def to_onehot(x):
    """Encode a 1-D collection of labels as a one-hot integer matrix.

    Columns follow the sorted order of the distinct values returned by
    ``np.unique``; row ``i`` holds a single 1 in the column of ``x[i]``.

    Args:
        x: 1-D array-like of labels.

    Returns:
        Integer ndarray of shape ``(len(x), number_of_distinct_values)``.
        An empty input yields an array of shape ``(0, 0)``.
    """
    # return_inverse gives, for every element, its position among the sorted
    # unique values, so no per-element Python search is needed.
    values, positions = np.unique(np.asarray(x), return_inverse=True)
    return np.eye(len(values))[positions.ravel()].astype(int)

def replace_column_with_onehot(d, col):
    """Return a new frame in which ``col`` is replaced by indicator columns.

    One column named ``"<col>_<value>"`` is joined onto ``d`` for every
    distinct value found in ``d[col]``, and ``col`` itself is then removed
    from the joined result. ``d`` is not modified.

    Raises:
        AssertionError: if ``d[col]`` contains NaN values.
    """
    missing_count = sum(d[col].isna())
    assert missing_count==0, "column must have no NaN values"

    categories = np.unique(d[col])
    indicators = to_onehot(d[col].values)
    new_names = ["%s_%s"%(col, categories[pos]) for pos in range(indicators.shape[1])]

    # Reuse the index of d so the join pairs each row with its own indicators.
    indicator_frame = pd.DataFrame(indicators, columns=new_names, index=d.index)
    expanded = d.join(indicator_frame, how='outer')
    del expanded[col]
    return expanded

def replace_category_for_numbers(d, col, vals_to_change):
    """Return a copy of ``d`` with the values of ``col`` recoded via a mapping.

    Args:
        d: source DataFrame; it is copied, not modified.
        col: name of the column to recode.
        vals_to_change: dict mapping each current value to its replacement.
            Values of ``col`` that are not keys of the dict are left as is.
    """
    recoded = d.copy()
    recoded[col] = recoded[col].replace(vals_to_change)
    return recoded

def replace_word(word, sentence: str):
    """Collapse every " DE " found in ``sentence`` into a single space.

    NOTE(review): ``word`` is accepted but never used; the text being removed
    is always the literal " DE ". Confirm whether ``word`` was meant to drive
    the replacement before relying on this helper.
    """
    pieces = sentence.split(" DE ")
    return " ".join(pieces)

# Ordered (wrong, corrected) pairs applied one after another by
# replace_wrong_words. The order is significant because each replacement
# operates on the output of the previous one. Built once at import time
# instead of on every call.
_PROGRAM_NAME_FIXES = (
    ('3DEG ', ''),
    (' PRIMARIA ', ' '),  # some program names carry this extra word
    (' PRIMARIA:', ' '),
    ('INGENIER?A', 'INGENIERIA'),
    ('II', 'I'),
    ('EDUCACI?N', 'EDUCACION'),
    ('F?SICA', 'FISICA'),
    ('RECREACI?N', 'RECREACION'),
    ('?NFASIS', 'ENFASIS'),
    ('INGL?S', 'INGLES'),
    ('DISE?O', 'DISENO'),
    ('COMUNICACI?N', 'COMUNICACION'),
    ('QU?MICA', 'QUIMICA'),
    ('MATEM?TICAS', 'MATEMATICAS'),
    ('EL?CTRICA', 'ELECTRICA'),
    ('M?SICA', 'MUSICA'),
    ('DISE??O', 'DISENO'),
    ('COMUNICACI??N', 'COMUNICACION'),
    ('GESTI?N', 'GESTION'),
    ('GR??FICA', 'GRAFICA'),
    ('B?SICA', 'BASICA'),
    ('ADMINISTRACI?N', 'ADMINISTRACION'),
    ('P?BLICA', 'PUBLICA'),
    ('FARMAC?UTICA', 'FARMACEUTICA'),
    ('FILOSOF?A', 'FILOSOFIA'),
    ('ESC?NICAS', 'ESCENICAS'),
    ('ECONOM?A', 'ECONOMIA'),
    ('GASTRONOM?A', 'GASTRONOMIA'),
    ('GEOLOG?A', 'GEOLOGIA'),
    ('LOG?STICA', 'LOGISTICA'),
    ('ART?STICA', 'ARTISTICA'),
    ('PEDAGOG?A', 'PEDAGOGIA'),
    (' DE ', ' '),
    (' EN ', ' '),
    ('  ', ' '),
    ('.', ''),
    ('- ', ' '),
)


def replace_wrong_words(sentence):
    """Clean a program name by applying a fixed list of text substitutions.

    Every (wrong, corrected) pair of ``_PROGRAM_NAME_FIXES`` is applied in
    order with ``str.replace``: words containing ``?`` in place of a letter
    are rewritten, the connectors " DE " and " EN " are removed, a double
    space is collapsed, and periods and "- " are stripped.

    Args:
        sentence: the text to clean. The patterns are upper case, so the
            caller is expected to upper-case the text first.

    Returns:
        The cleaned string; text matching no pattern is returned unchanged.
    """
    result = sentence
    for wrong, well in _PROGRAM_NAME_FIXES:
        # str.replace is already a no-op when `wrong` is absent.
        result = result.replace(wrong, well)
    return result

#### Eliminación de columnas con más correlación

### Entrenamiento

In [None]:
def format_df(dataframe):
    """Turn a raw students DataFrame into the encoded feature table.

    Steps, in order:
      1. Drop the ``ID`` column and fill every missing value with ``'Miss'``.
      2. One-hot encode ``FAMI_TIENEINTERNET`` and
         ``ESTU_PAGOMATRICULAPROPIO``, then discard the
         ``FAMI_TIENEINTERNET_Miss`` indicator when it exists.
      3. Recode the ordered categorical columns (tuition value, weekly
         working hours, housing stratum, parents' education and, when the
         column is present, ``RENDIMIENTO_GLOBAL``) to integers.
      4. Normalise ``ESTU_PRGM_ACADEMICO`` (ASCII transliteration, upper
         case, ``replace_wrong_words``) and transliterate
         ``ESTU_PRGM_DEPARTAMENTO``.
      5. One-hot encode ``ESTU_PRGM_DEPARTAMENTO`` and convert
         ``ESTU_PRGM_ACADEMICO`` to pandas category codes.

    NOTE(review): the one-hot columns and the category codes are derived
    only from the values present in ``dataframe``, so two frames with
    different sets of values (e.g. train vs. test) can receive different
    columns or codes. Verify the alignment before feeding both to one model.

    Args:
        dataframe: raw frame containing the columns named above plus ``ID``.
            It is not modified.

    Returns:
        A new DataFrame with the encoded columns.
    """
    students = dataframe.drop(['ID'], axis=1).fillna('Miss')

    encoded = students
    for col in ('FAMI_TIENEINTERNET', 'ESTU_PAGOMATRICULAPROPIO'):
        encoded = replace_column_with_onehot(encoded, col)

    # The indicator only exists when the column had missing values, so a
    # frame without any must not fail here.
    encoded = encoded.drop(['FAMI_TIENEINTERNET_Miss'], axis=1, errors='ignore')

    data_matricula_dict = {
        'Miss': 0,
        'No pagó matrícula': 1,
        'Menos de 500 mil': 2,
        'Entre 500 mil y menos de 1 millón': 3,
        'Entre 1 millón y menos de 2.5 millones': 4,
        'Entre 2.5 millones y menos de 4 millones': 5,
        'Entre 4 millones y menos de 5.5 millones': 6,
        'Entre 5.5 millones y menos de 7 millones': 7,
        'Más de 7 millones': 8,
    }

    data_working_hours_dict = {
        'Miss': 0,
        '0': 1,
        'Menos de 10 horas': 2,
        'Entre 11 y 20 horas': 3,
        'Entre 21 y 30 horas': 4,
        'Más de 30 horas': 5,
    }

    data_estrato_dict = {
        'Miss': -1,
        'Sin Estrato': 0,
        'Estrato 1': 1,
        'Estrato 2': 2,
        'Estrato 3': 3,
        'Estrato 4': 4,
        'Estrato 5': 5,
        'Estrato 6': 6,
    }

    data_education_dict = {
        'Miss': -2,
        'No Aplica': -1,
        'Ninguno': 0,
        'No sabe': 1,
        'Primaria incompleta': 2,
        'Primaria completa': 3,
        'Secundaria (Bachillerato) incompleta': 4,
        'Secundaria (Bachillerato) completa': 5,
        'Técnica o tecnológica incompleta': 6,
        'Técnica o tecnológica completa': 7,
        'Educación profesional incompleta': 8,
        'Educación profesional completa': 9,
        'Postgrado': 10,
    }

    data_performance_dict = {
        'bajo': 1,
        'medio-bajo': 2,
        'medio-alto': 3,
        'alto': 4,
    }

    recodings = [
        ("ESTU_VALORMATRICULAUNIVERSIDAD", data_matricula_dict),
        ("ESTU_HORASSEMANATRABAJA", data_working_hours_dict),
        ("FAMI_ESTRATOVIVIENDA", data_estrato_dict),
        ("FAMI_EDUCACIONPADRE", data_education_dict),
        ("FAMI_EDUCACIONMADRE", data_education_dict),
    ]
    # The target column is only recoded when the frame carries it.
    if "RENDIMIENTO_GLOBAL" in encoded.columns:
        recodings.append(("RENDIMIENTO_GLOBAL", data_performance_dict))
    for col, mapping in recodings:
        encoded = replace_category_for_numbers(encoded, col, mapping)

    # Normalise the free-text program name so spelling variants collapse
    # before the name is converted to category codes.
    program = encoded["ESTU_PRGM_ACADEMICO"].map(unidecode.unidecode)
    program = program.map(lambda sentence: sentence.upper())
    encoded["ESTU_PRGM_ACADEMICO"] = program.map(replace_wrong_words)
    encoded["ESTU_PRGM_DEPARTAMENTO"] = encoded["ESTU_PRGM_DEPARTAMENTO"].map(unidecode.unidecode)

    final_student_df = replace_column_with_onehot(encoded, 'ESTU_PRGM_DEPARTAMENTO')
    final_student_df['ESTU_PRGM_ACADEMICO'] = final_student_df['ESTU_PRGM_ACADEMICO'].astype('category').cat.codes

    return final_student_df



In [None]:
# Encode the training frame, then separate the feature table from the target.
final_student_df = format_df(students_df)

X = final_student_df.drop(columns=['RENDIMIENTO_GLOBAL'])
y = final_student_df["RENDIMIENTO_GLOBAL"].values
# The encoded frame is no longer needed once X and y exist.
del final_student_df
print(X.shape, y.shape)

  r[col] = r[col].replace(vals_to_change.keys(), vals_to_change.values())


(692500, 43) (692500,)


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

def fit_and_score(estimator, X, y, test_size, random_state=42):
    """Fit the estimator on a train split and score it on both splits.

    Args:
        estimator: object exposing ``fit``, ``score`` and ``predict``.
        X: feature table.
        y: target values aligned with ``X``.
        test_size: forwarded to ``train_test_split`` to size the held-out
            split.
        random_state: seed forwarded to ``train_test_split``. Defaults to
            42, the value that used to be hard-coded.

    Returns:
        Tuple ``(estimator, train_score, test_score, accuracy, report)``:
        the fitted estimator, its ``score`` on the train and test splits,
        ``accuracy_score`` of the test predictions, and the text produced
        by ``classification_report`` for those predictions.
    """
    # Honour the caller's test_size instead of a fixed 0.3.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    estimator.fit(X_train, y_train)

    train_score = estimator.score(X_train, y_train)
    test_score = estimator.score(X_test, y_test)

    y_pred = estimator.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return estimator, train_score, test_score, accuracy, report

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Candidate hyper-parameter values explored by the grid search.
param_grid = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[2, 4],
    max_features=[None, 'sqrt', 'log2'],
    random_state=[42],
)

classifier = DecisionTreeClassifier(random_state=42)

# Cross-validated search over param_grid, ranked by accuracy, on all cores.
estimator = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
estimator, train_score, test_score, accuracy, report = fit_and_score(estimator, X, y, 0.3)

# One line per metric, followed by the per-class report.
print(
    f"Train Score: {train_score:.3f}",
    f"Test Score: {test_score:.3f}",
    f"Accuracy: {accuracy:.3f}",
    "Classification Report:",
    report,
    sep="\n",
)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Train Score: 0.404
Test Score: 0.398
Accuracy: 0.398
Classification Report:
              precision    recall  f1-score   support

           1       0.41      0.54      0.46     51767
           2       0.31      0.22      0.26     51712
           3       0.30      0.23      0.26     51439
           4       0.50      0.58      0.54     52832

    accuracy                           0.40    207750
   macro avg       0.38      0.40      0.38    207750
weighted avg       0.38      0.40      0.38    207750



In [None]:
# Encode the test set with the same preprocessing function used for training.
X = format_df(test_df)

# Discard columns whose name starts with "Unnamed" (presumably stray index
# columns from a re-saved CSV; confirm against the data).
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]

# The encoding is computed from the test frame alone, so its columns can
# differ from the ones seen at fit time (e.g. a category that is absent here).
# When the fitted estimator exposes the training feature names, align to
# them: missing indicator columns become 0 and unknown columns are dropped.
fitted_columns = getattr(estimator, "feature_names_in_", None)
if fitted_columns is not None:
    X = X.reindex(columns=fitted_columns, fill_value=0)

y_predict = estimator.predict(X)


  r[col] = r[col].replace(vals_to_change.keys(), vals_to_change.values())


In [None]:
# Map the integer predictions back to the original performance labels.
data_performance_dict = {
      1: 'bajo',
      2: 'medio-bajo',
      3: 'medio-alto',
      4: 'alto',
}
y = [data_performance_dict[code] for code in y_predict]

# Take the identifiers from the raw test frame. format_df drops the ID
# column, so X.index only carries the default row index created by
# read_csv, not the real IDs.
index = list(test_df['ID'])
response = pd.DataFrame({
    'ID': index,
    'RENDIMIENTO_GLOBAL': y
}).set_index('ID')
response.to_csv("out.csv")