# Preparación de datos

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
# Use the builder entry point: getOrCreate() returns the already-active
# session (and its SparkContext) when one exists, so re-running this cell does
# not construct a second SparkSession the way calling SparkSession(sc) does.
spark = SparkSession.builder.getOrCreate()
# Keep `sc` bound for any cell that needs the low-level SparkContext.
sc = spark.sparkContext

In [2]:
# Show the active SparkSession object as this cell's output.
spark

In [4]:
##
## Row represents a single row in an RDD
##
from pyspark.sql import Row
##
## Build a DataFrame from the pipe-delimited CSV file
##
nombre_archivo="evaluacion"
# NOTE: the former decimal="." argument was dropped. It is a pandas read_csv
# option, not a Spark CSV option, so Spark ignored it; keeping it only
# suggested a behavior that does not exist.
df = spark.read.load("D:\\CLASES\\ELECTIVA 3 BigData\\Taller_final\\"+nombre_archivo+".csv",
                     format="csv",
                     sep="|",
                     inferSchema=True,   # let Spark derive column types from the data
                     encoding='UTF-8',
                     header=True)        # first line holds the column names
df.printSchema()

root
 |-- cod_alumno: integer (nullable = true)
 |-- ciudad: string (nullable = true)
 |-- sexo_f: integer (nullable = true)
 |-- puntaje_ingreso: double (nullable = true)
 |-- promedio_notas: double (nullable = true)
 |-- total_creditos_logrados: double (nullable = true)
 |-- biologia: double (nullable = true)
 |-- matematicas: double (nullable = true)
 |-- filosofia: double (nullable = true)
 |-- fisica: double (nullable = true)
 |-- historia: double (nullable = true)
 |-- quimica: double (nullable = true)
 |-- lenguaje: double (nullable = true)
 |-- geografia: double (nullable = true)
 |-- idiomas: double (nullable = true)
 |-- sociales: double (nullable = true)
 |-- lecturacritica: double (nullable = true)
 |-- ciencias: double (nullable = true)
 |-- razonamiento: double (nullable = true)
 |-- competenciaciud: double (nullable = true)
 |-- valor_matricula: double (nullable = true)
 |-- valor_servicios: double (nullable = true)
 |-- ingresos: double (nullable = true)
 |-- vive_famil

# Contar cantidad de registros

In [5]:
# Total number of rows loaded from the CSV file.
df.count()

264

# Contar cantidad de registros por clase

In [6]:
# Row count for each distinct value of the `clase` column.
conteo_por_clase = df.groupBy("clase").count()
conteo_por_clase.show()

+-----+-----+
|clase|count|
+-----+-----+
| null|  264|
+-----+-----+



# Contar estudiantes por ciudad

In [7]:
# Row count for each distinct value of the `ciudad` column.
conteo_por_ciudad = df.groupBy("ciudad").count()
conteo_por_ciudad.show()

+--------------------+-----+
|              ciudad|count|
+--------------------+-----+
|TUMACO           ...|   32|
|PASTO            ...|  193|
|IPIALES          ...|   30|
|TUQUERRES        ...|    9|
+--------------------+-----+



# Contar estudiantes por año

In [8]:
# Row count per reporting year, listed in ascending year order.
conteo_por_anio = df.groupBy("anio_reporte").count()
conteo_por_anio.orderBy("anio_reporte").show()

+------------+-----+
|anio_reporte|count|
+------------+-----+
|      2018.0|  264|
+------------+-----+



# Explorar valores faltantes

In [9]:
from pyspark.sql.functions import isnan, when, count, col
# Null count per column, computed in ONE Spark job instead of one
# filter().count() job per column.
# when() without otherwise() yields null for rows that do not match, and
# count() skips nulls, so each aggregate counts exactly the rows where that
# column is null. The loop variable is `c` so it does not shadow the imported
# `col` function.
Dict_Null = df.select(
    [count(when(df[c].isNull(), 1)).alias(c) for c in df.columns]
).first().asDict()
Dict_Null

{'cod_alumno': 0,
 'ciudad': 0,
 'sexo_f': 0,
 'puntaje_ingreso': 0,
 'promedio_notas': 1,
 'total_creditos_logrados': 1,
 'biologia': 3,
 'matematicas': 3,
 'filosofia': 3,
 'fisica': 3,
 'historia': 3,
 'quimica': 4,
 'lenguaje': 4,
 'geografia': 4,
 'idiomas': 5,
 'sociales': 13,
 'lecturacritica': 129,
 'ciencias': 129,
 'razonamiento': 129,
 'competenciaciud': 129,
 'valor_matricula': 11,
 'valor_servicios': 11,
 'ingresos': 11,
 'vive_familia': 0,
 'estrato': 12,
 'estrato_vivira': 12,
 'actualmente_trabaja': 4,
 'valor_matric_colegio': 4,
 'ano_pago_colegio': 11,
 'ingresos_familiares': 4,
 'ano_ingresos': 11,
 'numero_aportantes': 11,
 'numero_hermanos': 4,
 'numero_hermanos_est_superior': 4,
 'edad_inicio': 0,
 'edad_reporte': 0,
 'anio_reporte': 0,
 'clase': 264}

In [10]:
# Print the describe() summary table of each column separately, so every
# table stays narrow enough to read.
for nombre_columna in df.columns:
    resumen = df.describe(nombre_columna)
    resumen.show()

+-------+-------------------+
|summary|         cod_alumno|
+-------+-------------------+
|  count|                264|
|   mean|9.203732674545455E8|
| stddev|9.448490349661958E8|
|    min|               7644|
|    max|         2141601270|
+-------+-------------------+

+-------+--------------------+
|summary|              ciudad|
+-------+--------------------+
|  count|                 264|
|   mean|                null|
| stddev|                null|
|    min|IPIALES          ...|
|    max|TUQUERRES        ...|
+-------+--------------------+

+-------+-------------------+
|summary|             sexo_f|
+-------+-------------------+
|  count|                264|
|   mean|0.20833333333333334|
| stddev|0.40688778297473344|
|    min|                  0|
|    max|                  1|
+-------+-------------------+

+-------+------------------+
|summary|   puntaje_ingreso|
+-------+------------------+
|  count|               264|
|   mean| 60.23227272727273|
| stddev|14.921546882109602|
|   

+-------+------------------+
|summary|       edad_inicio|
+-------+------------------+
|  count|               264|
|   mean|19.522727272727273|
| stddev| 3.876351051784115|
|    min|               0.0|
|    max|              40.0|
+-------+------------------+

+-------+------------------+
|summary|      edad_reporte|
+-------+------------------+
|  count|               264|
|   mean|23.617424242424242|
| stddev| 5.632051084986429|
|    min|               1.0|
|    max|              48.0|
+-------+------------------+

+-------+------------+
|summary|anio_reporte|
+-------+------------+
|  count|         264|
|   mean|      2018.0|
| stddev|         0.0|
|    min|      2018.0|
|    max|      2018.0|
+-------+------------+

+-------+-----+
|summary|clase|
+-------+-----+
|  count|    0|
|   mean| null|
| stddev| null|
|    min| null|
|    max| null|
+-------+-----+



# Métodos para autocompletar los valores faltantes

1 imputar valores faltantes con un cero

In [11]:
# Strategy 1: replacement value per column -- the text "DESCONOCIDO" for
# string columns and 0 for every other column type.
complete_dict = {
    nombre: ("DESCONOCIDO" if tipo == 'string' else 0)
    for nombre, tipo in df.dtypes
}
# na.fill with a dict replaces nulls column by column with the mapped value.
df_ml_complete = df.na.fill(complete_dict)


2 imputar valores faltantes con un valor fuera de rango -99

In [12]:
# Strategy 2: string columns get the text "DESCONOCIDO"; every other column
# gets the sentinel -99.
columnas_texto = [nombre for nombre, tipo in df.dtypes if tipo == 'string']
complete_dict = dict.fromkeys(df.columns, -99)
complete_dict.update(dict.fromkeys(columnas_texto, "DESCONOCIDO"))
# na.fill with a dict replaces nulls column by column with the mapped value.
df_ml_complete = df.na.fill(complete_dict)


3 imputar valores faltantes con el promedio

In [None]:
# Strategy 3: string columns get "DESCONOCIDO"; every other column gets its
# own average.
complete_dict = dict()

# Compute all averages in a single Spark job rather than one job per column.
columnas_no_texto = [nombre for nombre, tipo in df.dtypes if tipo != 'string']
if columnas_no_texto:
    promedios = df.selectExpr(
        *["avg(`" + nombre + "`) as `" + nombre + "`" for nombre in columnas_no_texto]
    ).first().asDict()
else:
    promedios = {}

for nombre, tipo in df.dtypes:
    if tipo == 'string':
        complete_dict[nombre] = "DESCONOCIDO"
    elif promedios[nombre] is not None:
        # avg() is None when a column has no non-null values; such a column
        # is left out because None is not a usable fill value.
        complete_dict[nombre] = promedios[nombre]

# NOTE(review): the result is stored under its own name on purpose. This cell
# has no execution count, and the model artifacts loaded later in this
# notebook ('modelo_99_arbol', 'columnas_99.dat') appear to correspond to the
# -99 imputation. Assigning to `df_ml_complete` here would silently replace
# that imputation on "Restart & Run All" -- confirm before switching strategy.
df_ml_complete_promedio = df.na.fill(complete_dict)

In [13]:
from pyspark.sql.functions import isnan, when, count, col
# Re-check null counts after imputation, computed in ONE Spark job instead of
# one filter().count() job per column.
# when() without otherwise() yields null for rows that do not match, and
# count() skips nulls, so each aggregate counts exactly the rows where that
# column is null. The loop variable is `c` so it does not shadow the imported
# `col` function.
Dict_Null = df_ml_complete.select(
    [count(when(df_ml_complete[c].isNull(), 1)).alias(c) for c in df_ml_complete.columns]
).first().asDict()
Dict_Null

{'cod_alumno': 0,
 'ciudad': 0,
 'sexo_f': 0,
 'puntaje_ingreso': 0,
 'promedio_notas': 0,
 'total_creditos_logrados': 0,
 'biologia': 0,
 'matematicas': 0,
 'filosofia': 0,
 'fisica': 0,
 'historia': 0,
 'quimica': 0,
 'lenguaje': 0,
 'geografia': 0,
 'idiomas': 0,
 'sociales': 0,
 'lecturacritica': 0,
 'ciencias': 0,
 'razonamiento': 0,
 'competenciaciud': 0,
 'valor_matricula': 0,
 'valor_servicios': 0,
 'ingresos': 0,
 'vive_familia': 0,
 'estrato': 0,
 'estrato_vivira': 0,
 'actualmente_trabaja': 0,
 'valor_matric_colegio': 0,
 'ano_pago_colegio': 0,
 'ingresos_familiares': 0,
 'ano_ingresos': 0,
 'numero_aportantes': 0,
 'numero_hermanos': 0,
 'numero_hermanos_est_superior': 0,
 'edad_inicio': 0,
 'edad_reporte': 0,
 'anio_reporte': 0,
 'clase': 0}

In [14]:
# Print the describe() summary table of each imputed column separately.
for nombre_columna in df_ml_complete.columns:
    resumen = df_ml_complete.describe(nombre_columna)
    resumen.show()

+-------+-------------------+
|summary|         cod_alumno|
+-------+-------------------+
|  count|                264|
|   mean|9.203732674545455E8|
| stddev|9.448490349661958E8|
|    min|               7644|
|    max|         2141601270|
+-------+-------------------+

+-------+--------------------+
|summary|              ciudad|
+-------+--------------------+
|  count|                 264|
|   mean|                null|
| stddev|                null|
|    min|IPIALES          ...|
|    max|TUQUERRES        ...|
+-------+--------------------+

+-------+-------------------+
|summary|             sexo_f|
+-------+-------------------+
|  count|                264|
|   mean|0.20833333333333334|
| stddev|0.40688778297473344|
|    min|                  0|
|    max|                  1|
+-------+-------------------+

+-------+------------------+
|summary|   puntaje_ingreso|
+-------+------------------+
|  count|               264|
|   mean| 60.23227272727273|
| stddev|14.921546882109602|
|   

+-------+-------------------+
|summary|  numero_aportantes|
+-------+-------------------+
|  count|                264|
|   mean|-2.7007575757575757|
| stddev| 20.143341826921777|
|    min|              -99.0|
|    max|                8.0|
+-------+-------------------+

+-------+------------------+
|summary|   numero_hermanos|
+-------+------------------+
|  count|               264|
|   mean|               0.5|
| stddev|12.568897577424488|
|    min|             -99.0|
|    max|              17.0|
+-------+------------------+

+-------+----------------------------+
|summary|numero_hermanos_est_superior|
+-------+----------------------------+
|  count|                         264|
|   mean|         -1.1174242424242424|
| stddev|          12.252936304947411|
|    min|                       -99.0|
|    max|                        13.0|
+-------+----------------------------+

+-------+------------------+
|summary|       edad_inicio|
+-------+------------------+
|  count|               264|

# Transformar la ciudad a Dummies

In [15]:
df_ml_complete.createOrReplaceTempView("estudiantes")

# (city value, dummy column name) pairs, in the order the dummy columns must
# appear in the result.
# NOTE(review): the city counts shown earlier include TUQUERRES, which has no
# dummy here and is therefore encoded as all zeros. Presumably this list
# mirrors the columns the trained model expects -- confirm before editing it.
ciudades_dummy = [
    ("LA UNION", "ciudad_union"),
    ("TUMACO", "ciudad_tumaco"),
    ("PASTO", "ciudad_pasto"),
    ("IPIALES", "ciudad_ipiales"),
    ("SAMANIEGO", "ciudad_samaniego"),
]

# trim() is needed because the raw `ciudad` values carry trailing padding.
columnas_dummy = ", ".join(
    "case when trim(e.ciudad)='" + ciudad + "' then 1 else 0 end " + columna
    for ciudad, columna in ciudades_dummy
)
# Explicit spaces around `e.*` and `from`: the previous string concatenation
# produced "e.*from", which only parsed thanks to lexer leniency.
sql = "select " + columnas_dummy + ", e.* from estudiantes e"

df_procesado=spark.sql(sql)
# The original text column is replaced by its dummy encoding.
df_procesado=df_procesado.drop("ciudad")
df_procesado.printSchema()

root
 |-- ciudad_union: integer (nullable = false)
 |-- ciudad_tumaco: integer (nullable = false)
 |-- ciudad_pasto: integer (nullable = false)
 |-- ciudad_ipiales: integer (nullable = false)
 |-- ciudad_samaniego: integer (nullable = false)
 |-- cod_alumno: integer (nullable = false)
 |-- sexo_f: integer (nullable = false)
 |-- puntaje_ingreso: double (nullable = false)
 |-- promedio_notas: double (nullable = false)
 |-- total_creditos_logrados: double (nullable = false)
 |-- biologia: double (nullable = false)
 |-- matematicas: double (nullable = false)
 |-- filosofia: double (nullable = false)
 |-- fisica: double (nullable = false)
 |-- historia: double (nullable = false)
 |-- quimica: double (nullable = false)
 |-- lenguaje: double (nullable = false)
 |-- geografia: double (nullable = false)
 |-- idiomas: double (nullable = false)
 |-- sociales: double (nullable = false)
 |-- lecturacritica: double (nullable = false)
 |-- ciencias: double (nullable = false)
 |-- razonamiento: doubl

# Aplicar Modelo Entrenado y Clasificar

Cargar modelo y variables

In [16]:
import joblib
from pyspark.ml.classification import DecisionTreeClassifier,DecisionTreeClassificationModel



# Both artifacts live in the same folder; name it once so the location can be
# changed in a single place.
directorio_taller = 'D:\\CLASES\\ELECTIVA 3 BigData\\Taller_final\\'

# Previously trained decision-tree model saved with Spark ML.
modelo = DecisionTreeClassificationModel.load(directorio_taller + 'modelo_99_arbol')
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load this artifact from a trusted source.
important_features=joblib.load(directorio_taller + 'columnas_99.dat')



Filtrar variables

In [21]:
from pyspark.ml.feature import VectorAssembler

# Pack the selected feature columns into the single vector column 'features'.
assembler_new = VectorAssembler(inputCols=important_features, outputCol='features')
evaluar = assembler_new.transform(df_procesado)

# Preview the assembled vectors. The former `take(1)` call was removed: its
# result was discarded (it was not the cell's last expression), so it only
# triggered an extra Spark job.
evaluar.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[48.0,2.610869565...|
|[33.0,2.588421052...|
|[39.0,2.733663366...|
|[36.0,2.941489361...|
|[33.0,3.150000000...|
+--------------------+
only showing top 5 rows



Clasificar

In [27]:
# Run the loaded model over the assembled rows. The resulting DataFrame carries
# the model outputs (e.g. `prediction` and `probability`, both read by later
# cells) alongside the input columns.
df_pred = modelo.transform(evaluar)
df_pred.printSchema()

root
 |-- ciudad_union: integer (nullable = false)
 |-- ciudad_tumaco: integer (nullable = false)
 |-- ciudad_pasto: integer (nullable = false)
 |-- ciudad_ipiales: integer (nullable = false)
 |-- ciudad_samaniego: integer (nullable = false)
 |-- cod_alumno: integer (nullable = false)
 |-- sexo_f: integer (nullable = false)
 |-- puntaje_ingreso: double (nullable = false)
 |-- promedio_notas: double (nullable = false)
 |-- total_creditos_logrados: double (nullable = false)
 |-- biologia: double (nullable = false)
 |-- matematicas: double (nullable = false)
 |-- filosofia: double (nullable = false)
 |-- fisica: double (nullable = false)
 |-- historia: double (nullable = false)
 |-- quimica: double (nullable = false)
 |-- lenguaje: double (nullable = false)
 |-- geografia: double (nullable = false)
 |-- idiomas: double (nullable = false)
 |-- sociales: double (nullable = false)
 |-- lecturacritica: double (nullable = false)
 |-- ciencias: double (nullable = false)
 |-- razonamiento: doubl

In [72]:
# Add a text rendering of the `probability` column as `probabilidad`, so a
# later cell can split it into one column per class.
probabilidad_texto = df_pred["probability"].cast("String")
df_pred = df_pred.withColumn("probabilidad", probabilidad_texto)


In [103]:
df_pred.createOrReplaceTempView('tabla_pred')

# Split the text form "[p0,p1]" on the comma, strip the surrounding bracket
# from each half, and cast each piece to double.
consulta_probabilidades = (
    "SELECT cod_alumno,prediction,"
    "cast(replace(split(probabilidad,',')[0],'[','')as double) as probabilidad_0,"
    "cast(replace(split(probabilidad,',')[1],']','')as double) as probabilidad_1 FROM tabla_pred"
)
df_eval = spark.sql(consulta_probabilidades)
df_eval.show()

+----------+----------+-------------------+-------------------+
|cod_alumno|prediction|     probabilidad_0|     probabilidad_1|
+----------+----------+-------------------+-------------------+
|      7644|       0.0| 0.9393939393939394|0.06060606060606061|
|  22155233|       0.0|               0.75|               0.25|
|  23033258|       1.0|0.03289473684210526| 0.9671052631578947|
|  23038230|       0.0| 0.9166666666666666|0.08333333333333333|
|  23038265|       0.0| 0.9117647058823529|0.08823529411764706|
|  25033227|       1.0|0.03289473684210526| 0.9671052631578947|
|  25033248|       0.0| 0.9024390243902439| 0.0975609756097561|
|  25033264|       0.0| 0.9024390243902439| 0.0975609756097561|
|  25155254|       1.0|0.03289473684210526| 0.9671052631578947|
|  25155257|       1.0|0.03289473684210526| 0.9671052631578947|
|  26033320|       1.0|0.03289473684210526| 0.9671052631578947|
|  26160247|       1.0|0.08108108108108109|  0.918918918918919|
|  27148234|       0.0|               0.

In [104]:
# Check the column names and types of the result table.
df_eval.printSchema()

root
 |-- cod_alumno: integer (nullable = false)
 |-- prediction: double (nullable = false)
 |-- probabilidad_0: double (nullable = true)
 |-- probabilidad_1: double (nullable = true)



In [105]:
# Number of rows in the result table.
df_eval.count()

264

# Contar cantidad de registros por clase predicha

In [106]:
# Row count for each predicted class.
conteo_por_prediccion = df_eval.groupBy("prediction").count()
conteo_por_prediccion.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  100|
|       1.0|  164|
+----------+-----+



In [107]:
# Summary statistics for every column of the result table.
df_eval.describe().show()

+-------+-------------------+------------------+--------------------+--------------------+
|summary|         cod_alumno|        prediction|      probabilidad_0|      probabilidad_1|
+-------+-------------------+------------------+--------------------+--------------------+
|  count|                264|               264|                 264|                 264|
|   mean|9.203732674545455E8|0.6212121212121212| 0.37631197375886194|   0.623688026241137|
| stddev|9.448490349661958E8|0.4860065096859095| 0.43269583886000157|  0.4326958388600013|
|    min|               7644|               0.0|0.003711952487008166|0.017699115044247787|
|    max|         2141601270|               1.0|  0.9823008849557522|  0.9962880475129918|
+-------+-------------------+------------------+--------------------+--------------------+



In [108]:
# Export the predictions as pipe-delimited CSV with a header row.
# mode("overwrite") makes the cell re-runnable: with the default save mode the
# write fails when the target path already exists.
# Note: Spark writes a directory of part files at this path, not a single file.
df_eval.write.mode("overwrite").csv('D:\\CLASES\\ELECTIVA 3 BigData\\Taller_final\\entrega_guerrero.csv',sep='|',header=True)