## Solución Prueba. SQL para Data Science

**Alumno:** Julio Valdés 

**Profesor:** Gabriel Tamayo L.

**Generación:** G5

In [1]:
import csv
import glob
import os
import re
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sqlalchemy import create_engine

import helpers

### Parte 1: Registro de los archivos en la base de datos

#### Generar una nueva base de datos con la siguiente nomenclatura: <code>apellido_nombre</code>
<code>createdb -U postgres valdes_julio</code>

#### Importar en tablas los archivos <code>train_cupid.csv</code> y <code>test_cupid.csv</code> a un motor Postgres, implementando sólo la librería <code>psycopg2</code>. Las tablas deben contener los nombres de las columnas y el total de los registros presentes en cada archivo.

In [2]:
# Read the training CSV from the working directory and preview its first two rows.
train_csv_path = 'train_cupid.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head(n=2)

Unnamed: 0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying to quit,smokes_when drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
0,35,70.0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
1,38,68.0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0


In [3]:
# Read the testing CSV from the working directory and preview its first two rows.
test_csv_path = 'test_cupid.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head(n=2)

Unnamed: 0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying to quit,smokes_when drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
0,22,75.0,0,0,0,0,0,0,1,0,...,1,1,1,0,0,0,0,0,0,1
1,32,65.0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [4]:
def fields_generate(columns, frame=None):
    """Build the column definitions for a PostgreSQL ``CREATE TABLE`` statement.

    Parameters
    ----------
    columns : iterable of str
        Column labels to translate into SQL field definitions.
    frame : pandas.DataFrame, optional
        DataFrame whose dtypes decide the SQL type of each column. When
        omitted, the module-level ``df_train`` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    list of str
        One ``"<sanitized_name> <sql_type>"`` entry per column, in input order.
        ``int64`` maps to ``integer``, ``float64`` to ``float`` and any other
        dtype to ``varchar(255)``.
    """
    # Resolve the dtype source lazily so callers that pass their own frame
    # do not depend on the global df_train being defined.
    source = df_train if frame is None else frame

    fields = []
    for column in columns:
        column_dtype = source[column].dtype
        if column_dtype == 'int64':
            sql_type = 'integer'
        elif column_dtype == 'float64':
            sql_type = 'float'
        else:
            sql_type = 'varchar(255)'

        # Collapse every run of characters outside [0-9a-zA-Z_] into a single
        # underscore so the label can be used as an unquoted SQL identifier.
        column_name = re.sub('[^0-9a-zA-Z_]+', '_', column)
        fields.append(column_name + " " + sql_type)

    return fields

#### Se crea conexión a la BD

In [5]:
# Open the psycopg2 connection used by the following cells.
# The password is read from the PGPASSWORD environment variable so it does not
# have to live in the notebook; the previous literal remains the fallback so
# the cell behaves as before when the variable is unset.
conn = psycopg2.connect(
    host='localhost',
    dbname='valdes_julio',
    user='postgres',
    password=os.environ.get('PGPASSWORD', 'password'),
)

#### Se crean las tablas train_cupid y test_cupid

In [6]:
# (Re)create one table per CSV. Any previous copy of a table is dropped first
# so the cell can be re-run (e.g. after Restart & Run All) instead of failing
# because the relation already exists; the insert cells below reload the rows.
# The table names come from this fixed tuple, so concatenating them into the
# SQL text does not involve untrusted input.
with conn.cursor() as cursor:
    for table_name, frame in (("train_cupid", df_train), ("test_cupid", df_test)):
        fields = fields_generate(frame.columns)
        cursor.execute("DROP TABLE IF EXISTS " + table_name)
        sql_create_table = "CREATE TABLE " + table_name + "("
        sql_create_table += ', '.join(fields)
        sql_create_table += ")"
        cursor.execute(sql_create_table)
        # Commit per table, matching the original one-commit-per-table flow.
        conn.commit()

#### Se ingresan los datos en tablas train_cupid y test_cupid

In [7]:
# Load every data row of train_cupid.csv into the train_cupid table.
# The number of %s placeholders is derived from the CSV header instead of
# being hard-coded, so the statement keeps matching the file if its column
# count changes. Values are still passed as query parameters, never
# interpolated into the SQL text.
with open('./train_cupid.csv', 'r', newline='') as file, conn.cursor() as cursor:
    reader = csv.reader(file)
    header = next(reader)  # consume the header row; only its length is needed
    placeholders = ', '.join(['%s'] * len(header))
    cursor.executemany("INSERT INTO train_cupid VALUES (" + placeholders + ")", reader)
# Single commit once all rows were sent.
conn.commit()

In [8]:
# Load every data row of test_cupid.csv into the test_cupid table.
# The number of %s placeholders is derived from the CSV header instead of
# being hard-coded, so the statement keeps matching the file if its column
# count changes. Values are still passed as query parameters, never
# interpolated into the SQL text.
with open('./test_cupid.csv', 'r', newline='') as file, conn.cursor() as cursor:
    reader = csv.reader(file)
    header = next(reader)  # consume the header row; only its length is needed
    placeholders = ', '.join(['%s'] * len(header))
    cursor.executemany("INSERT INTO test_cupid VALUES (" + placeholders + ")", reader)
# Single commit once all rows were sent.
conn.commit()

### Parte 2: Entrenamiento de modelos

#### Ingestar la tabla de training mediante psycopg2 para el posterior entrenamiento del modelo.

In [10]:
# Read the whole train_cupid table back through psycopg2 into a DataFrame.
# Column labels are taken from cursor.description and handed to the DataFrame
# constructor, so an empty result set yields an empty frame with the right
# columns instead of raising a length-mismatch error on assignment.
with conn.cursor() as cursor:
    cursor.execute("SELECT * FROM train_cupid;")
    data = cursor.fetchall()
    column_names = [desc[0] for desc in cursor.description]
df = pd.DataFrame(data, columns=column_names)
df.head(5)

Unnamed: 0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying_to_quit,smokes_when_drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
0,35,70.0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
1,38,68.0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
2,23,71.0,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,1,0,1
3,29,66.0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
4,29,67.0,0,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,1


#### Entrenar los siguientes modelos (sin necesidad de ajustar por hiperparámetros)
* <code>GradientBoostingClassifier</code>, <code>AdaBoostClassifier</code>, <code>RandomForestClassifier</code>, <code>SVC</code>, <code>DecisionTreeClassifier</code>, <code>LogisticRegression</code>, <code>BernoulliNB</code> 
* Existen tres vectores objetivo a evaluar: <code>single</code>, <code>seeing_someone</code> y <code>available</code>
* Serializar el objeto y preservarlo por cada combinación de modelo entrenado y vector objetivo.

In [11]:
# Candidate classifiers, all instantiated with their default hyperparameters.
models = [
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    RandomForestClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    LogisticRegression(),
    BernoulliNB(),
]
targets = ['single', 'seeing_someone', 'available']

# The feature matrix is the same for every target: all columns of df except
# the three target columns. drop() returns a new frame, so df is untouched.
X = df.drop(columns=targets)

for target in targets:
    obj_vect = df[target]
    # Fixed random_state keeps the train/test partition reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, obj_vect, test_size=.33, random_state=11238
    )
    for model in models:
        report_header = "Reporte para modelo {} y vector objetivo {}".format(model.__class__, target)
        print(report_header)
        # helpers.report_performance is defined outside this notebook; it
        # presumably fits the model, prints the report and serializes it when
        # pickle_it is True — confirm in helpers.py.
        helpers.report_performance(model, X_train, X_test, y_train, y_test, pickle_it = True)

Reporte para modelo <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> y vector objetivo single
              precision    recall  f1-score   support

           0       0.53      0.02      0.03       526
           1       0.92      1.00      0.96      6101

   micro avg       0.92      0.92      0.92      6627
   macro avg       0.73      0.51      0.49      6627
weighted avg       0.89      0.92      0.88      6627

Reporte para modelo <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'> y vector objetivo single
              precision    recall  f1-score   support

           0       0.43      0.02      0.03       526
           1       0.92      1.00      0.96      6101

   micro avg       0.92      0.92      0.92      6627
   macro avg       0.68      0.51      0.50      6627
weighted avg       0.88      0.92      0.88      6627

Reporte para modelo <class 'sklearn.ensemble.forest.RandomForestClassifier'> y vector objetivo single




              precision    recall  f1-score   support

           0       0.21      0.05      0.08       526
           1       0.92      0.98      0.95      6101

   micro avg       0.91      0.91      0.91      6627
   macro avg       0.57      0.52      0.52      6627
weighted avg       0.87      0.91      0.88      6627

Reporte para modelo <class 'sklearn.svm.classes.SVC'> y vector objetivo single


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       526
           1       0.92      1.00      0.96      6101

   micro avg       0.92      0.92      0.92      6627
   macro avg       0.46      0.50      0.48      6627
weighted avg       0.85      0.92      0.88      6627

Reporte para modelo <class 'sklearn.tree.tree.DecisionTreeClassifier'> y vector objetivo single
              precision    recall  f1-score   support

           0       0.14      0.19      0.16       526
           1       0.93      0.90      0.91      6101

   micro avg       0.85      0.85      0.85      6627
   macro avg       0.53      0.54      0.54      6627
weighted avg       0.87      0.85      0.86      6627

Reporte para modelo <class 'sklearn.linear_model.logistic.LogisticRegression'> y vector objetivo single
              precision    recall  f1-score   support

           0       0.40      0.01      0.02       526
           1       0.92      1.00    

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.00      0.00      0.00       249

   micro avg       0.96      0.96      0.96      6627
   macro avg       0.48      0.50      0.49      6627
weighted avg       0.93      0.96      0.94      6627

Reporte para modelo <class 'sklearn.ensemble.forest.RandomForestClassifier'> y vector objetivo seeing_someone
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.10      0.00      0.01       249

   micro avg       0.96      0.96      0.96      6627
   macro avg       0.53      0.50      0.49      6627
weighted avg       0.93      0.96      0.94      6627

Reporte para modelo <class 'sklearn.svm.classes.SVC'> y vector objetivo seeing_someone


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.00      0.00      0.00       249

   micro avg       0.96      0.96      0.96      6627
   macro avg       0.48      0.50      0.49      6627
weighted avg       0.93      0.96      0.94      6627

Reporte para modelo <class 'sklearn.tree.tree.DecisionTreeClassifier'> y vector objetivo seeing_someone
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      6378
           1       0.06      0.09      0.07       249

   micro avg       0.91      0.91      0.91      6627
   macro avg       0.51      0.52      0.51      6627
weighted avg       0.93      0.91      0.92      6627

Reporte para modelo <class 'sklearn.linear_model.logistic.LogisticRegression'> y vector objetivo seeing_someone


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.00      0.00      0.00       249

   micro avg       0.96      0.96      0.96      6627
   macro avg       0.48      0.50      0.49      6627
weighted avg       0.93      0.96      0.94      6627

Reporte para modelo <class 'sklearn.naive_bayes.BernoulliNB'> y vector objetivo seeing_someone
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.00      0.00      0.00       249

   micro avg       0.96      0.96      0.96      6627
   macro avg       0.48      0.50      0.49      6627
weighted avg       0.93      0.96      0.94      6627

Reporte para modelo <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> y vector objetivo available
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.0

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.00      0.00      0.00       249

   micro avg       0.96      0.96      0.96      6627
   macro avg       0.48      0.50      0.49      6627
weighted avg       0.93      0.96      0.94      6627

Reporte para modelo <class 'sklearn.ensemble.forest.RandomForestClassifier'> y vector objetivo available
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.17      0.00      0.01       249

   micro avg       0.96      0.96      0.96      6627
   macro avg       0.56      0.50      0.49      6627
weighted avg       0.93      0.96      0.94      6627

Reporte para modelo <class 'sklearn.svm.classes.SVC'> y vector objetivo available


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.00      0.00      0.00       249

   micro avg       0.96      0.96      0.96      6627
   macro avg       0.48      0.50      0.49      6627
weighted avg       0.93      0.96      0.94      6627

Reporte para modelo <class 'sklearn.tree.tree.DecisionTreeClassifier'> y vector objetivo available
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      6378
           1       0.06      0.10      0.07       249

   micro avg       0.91      0.91      0.91      6627
   macro avg       0.51      0.52      0.51      6627
weighted avg       0.93      0.91      0.92      6627

Reporte para modelo <class 'sklearn.linear_model.logistic.LogisticRegression'> y vector objetivo available
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6378
           1       0.00      0.

  'precision', 'predicted', average, warn_for)


### Parte 3: Exportación de predicciones

#### Ingestar la tabla de testing mediante psycopg2 para la posterior predicción del modelo.

In [12]:
# Read the whole test_cupid table back through psycopg2 into a DataFrame.
# Column labels are taken from cursor.description and handed to the DataFrame
# constructor, so an empty result set yields an empty frame with the right
# columns instead of raising a length-mismatch error on assignment.
with conn.cursor() as cursor:
    cursor.execute("SELECT * FROM test_cupid;")
    data_test = cursor.fetchall()
    column_names = [desc[0] for desc in cursor.description]
# This is the last use of the psycopg2 connection in the notebook.
conn.close()
df_test = pd.DataFrame(data_test, columns=column_names)
df_test.head(5)

Unnamed: 0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying_to_quit,smokes_when_drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
0,22,75.0,0,0,0,0,0,0,1,0,...,1,1,1,0,0,0,0,0,0,1
1,32,65.0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,24,67.0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1
3,29,62.0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
4,39,65.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [13]:
# Group the serialized models found in the working directory by target:
# a .pkl file belongs to a target when the target name appears in its file
# name. Paths keep the order in which glob returned them.
pkl_paths = glob.glob(os.getcwd() + '/*.pkl')
pickles = {
    target: [path for path in pkl_paths if target in os.path.basename(path)]
    for target in targets
}

#### En base a los objetos serializados, predecir y evaluar cuatro queries específicas:
* Query 1: 'atheism', 'asian', 'employed', 'pro_dogs', 'chinese'
* Query 2: 'income_over_75', 'french', 'german', 'orientation_straight', 'new york'
* Query 3: 'education_undergrad_university', 'body_type_regular', 'pro_dogs', 'employed'
* Query 4: 'taurus', 'indian', 'washington', 'income_between_50_75', 'hinduism'

#### Cada una de estas queries específicas debe ser registrada en la base de datos.

In [14]:
# Column groups to cross-tabulate for each query. Names follow the sanitized
# column names stored in the database (e.g. "new_york" rather than "new york").
queries = {
    "query1": ['atheism', 'asian', 'employed', 'pro_dogs', 'chinese'],
    "query2": ["income_over_75", "french", "german", "orientation_straight", "new_york"],
    "query3": ["education_undergrad_university", "body_type_regular", "pro_dogs", "employed"],
    "query4": ["taurus", "indian", "washington", "income_between_50_75", "hinduism"]
}

# The password is read from PGPASSWORD instead of being embedded in the URL;
# the previous literal remains the fallback so behavior is unchanged when the
# variable is unset. quote_plus escapes characters such as '@' or '/' that
# would otherwise break the connection URL.
db_password = quote_plus(os.environ.get('PGPASSWORD', 'password'))
engine = create_engine('postgresql://postgres:{}@localhost/valdes_julio'.format(db_password))

for query in queries:
    for target in pickles:
        # Fresh copy per (query, target): features are every column except
        # the three targets, and obj_vect is the target being evaluated.
        X = df_test.copy()
        obj_vect = X[target]
        X = X.drop(columns=targets)
        for pickled_model in pickles[target]:
            filename = os.path.basename(pickled_model)
            # The model name is taken as the second-to-last "_"-separated
            # token of the file name (naming is decided in helpers — confirm).
            modelname = filename.split("_")[-2]
            # NOTE(review): create_crosstab presumably unpickles the model;
            # only load .pkl files produced by this notebook, since
            # unpickling untrusted files can execute arbitrary code.
            df_table = helpers.create_crosstab(pickled_model, X, obj_vect, queries[query])
            # One table per query/target/model; re-runs overwrite the table.
            df_table.to_sql("{}_{}_{}".format(query, target, modelname), con=engine, if_exists="replace")