## Tabla de contenidos

1. [Primer modelo IAM - Rendimiento - 1000 personas](#Primer-modelo-IAM-Rendimiento-1000-personas)
2. [Primer modelo IAM - Rendimiento - 7000 personas](#Primer-modelo-IAM-Rendimiento-7000-personas)
3. [Primer modelo IAM - ROC](#Primer-modelo-IAM-ROC)
4. [Segundo modelo IAM - Rendimiento - 1000 personas](#Segundo-modelo-IAM-Rendimiento-1000-personas)
5. [Segundo modelo IAM - Rendimiento - 7000 personas](#Segundo-modelo-IAM-Rendimiento-7000-personas)
6. [Segundo modelo IAM - ROC](#Segundo-modelo-IAM-ROC)
7. [Tercer modelo IAM - Rendimiento - 1000 personas](#Tercer-modelo-IAM-Rendimiento-1000-personas)
8. [Tercer modelo IAM - Rendimiento - 7000 personas](#Tercer-modelo-IAM-Rendimiento-7000-personas)
9. [Tercer modelo IAM - ROC](#Tercer-modelo-IAM-ROC)

# Acreditación Scores

In [1]:
# Create spark context
# Only on Windows
import os
import sys

# Raw string: in a normal literal '\s' is an invalid escape sequence, which
# newer Python versions warn about; the resulting value is unchanged.
spark_path = r'C:\spark-2.0.1-bin-hadoop2.7'

os.environ['SPARK_HOME'] = spark_path

# Make the Spark binaries, the PySpark sources and the bundled py4j archive
# importable from this interpreter (appended in the original order).
sys.path.extend(
    spark_path + suffix
    for suffix in (
        "/bin",
        "/python",
        "/python/pyspark/",
        "/python/lib",
        "/python/lib/pyspark.zip",
        "/python/lib/py4j-0.10.3-src.zip",
    )
)

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

# Spark context with master "local" and application name "score_acreditation".
sc = SparkContext("local", "score_acreditation")
# SQL entry point built on that context; read_file_df uses it to load the CSV files.
sqlContext = SQLContext(sc)

In [1]:
# General config
# Field delimiter (tab) that read_file_df passes to the CSV reader.
separator = '\t'

In [2]:
# with full file
from pyspark.sql.types import *

# Column layout of the full scores file: affiliate key and period, then one
# group of five columns (predict_*, x0_c_*, x1_c_*, th_pred_*,
# th_deafault_pred_*) per outcome, then the score generation date.
# NOTE: the notebook cell that follows rebinds `schema` to an IAM-only layout.
schema = StructType([
    StructField(column_name, column_type(), is_nullable)
    for column_name, column_type, is_nullable in [
        # Affiliate key and scoring period
        ("id_afiliado", IntegerType, False),
        ("period", DateType, True),

        # Cardiovascular risk
        ("predict_pred_aaat_rcv", StringType, True),
        ("x0_c_pred_aaat_rcv", DoubleType, True),
        ("x1_c_pred_aaat_rcv", DoubleType, True),
        ("th_pred_pred_aaat_rcv", StringType, True),
        ("th_deafault_pred_pred_aaat_rcv", StringType, True),

        # Acute myocardial infarction
        ("predict_pred_aaat_iam", StringType, True),
        ("x0_c_pred_aaat_iam", DoubleType, True),
        ("x1_c_pred_aaat_iam", DoubleType, True),
        ("th_pred_pred_aaat_iam", DoubleType, True),
        ("th_deafault_pred_pred_aaat_iam", DoubleType, True),

        # Stage 5 renal disease
        ("predict_pred_aaat_er5", StringType, True),
        ("x0_c_pred_aaat_er5", DoubleType, True),
        ("x1_c_pred_aaat_er5", DoubleType, True),
        ("th_pred_pred_aaat_er5", DoubleType, True),
        ("th_deafault_pred_pred_aaat_er5", DoubleType, True),

        # Death
        ("predict_pred_aaat_mue", StringType, True),
        ("x0_c_pred_aaat_mue", DoubleType, True),
        ("x1_c_pred_aaat_mue", DoubleType, True),
        ("th_pred_pred_aaat_mue", DoubleType, True),
        ("th_deafault_pred_pred_aaat_mue", StringType, True),

        # Date on which the scores were generated
        ("fecha_pred_generacion_scores", DateType, True),
    ]
])

In [3]:
from pyspark.sql.types import *

# Column layout of the IAM-only prediction files read by the analysis cells.
schema = StructType([
    StructField(column_name, column_type(), is_nullable)
    for column_name, column_type, is_nullable in [
        # Affiliate key and scoring period
        ("id_afiliado", IntegerType, False),
        ("period", DateType, True),

        # Acute myocardial infarction
        ("predict_pred_aaat_iam", StringType, True),
        ("x0_c_pred_aaat_iam", DoubleType, True),
        ("x1_c_pred_aaat_iam", DoubleType, True),
        ("th_pred_pred_aaat_iam", DoubleType, True),
        ("th_deafault_pred_pred_aaat_iam", DoubleType, True),

        # Date on which the scores were generated
        ("fecha_pred_generacion_scores", DateType, True),
    ]
])

In [4]:
# Column layout of the complications file. The complication date is read as
# text here and converted to a date column after loading.
schema_complications = StructType([
    StructField(column_name, column_type(), is_nullable)
    for column_name, column_type, is_nullable in [
        ("id_afiliado", IntegerType, False),
        ("fecha_complicacion", StringType, True),
        ("tipo_complicacion", StringType, True),
        ("complicacion", StringType, True),
    ]
])

In [5]:
# Load a delimited text file into a Spark DataFrame
def read_file_df(file_name, schema):
    """Read ``file_name`` with the given schema and return the DataFrame.

    The file is read through the module-level ``sqlContext`` with the
    'com.databricks.spark.csv' format, a header row, and the module-level
    ``separator`` as field delimiter.
    """
    csv_reader = sqlContext.read.format('com.databricks.spark.csv')
    csv_reader = csv_reader.options(header='true', delimiter=separator)
    return csv_reader.load(file_name, schema=schema)

In [6]:
# Parse date to date format
from pyspark.sql.functions import udf
from datetime import datetime

# Convert a 'dd/mm/YYYY' string to a date. NULL or empty cells yield NULL
# instead of making strptime raise inside the Spark job, and .date() returns a
# plain date, matching the declared DateType.
parser_to_date = udf(
    lambda x: datetime.strptime(x, '%d/%m/%Y').date() if x else None,
    DateType())

In [7]:
# Read complications - section: October 2nd, 2016
# On linux
path = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/TRA_T_Complicaciones_2016_10_02.csv'
# On windows
#path = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/TRA_T_Complicaciones_2016_10_02.csv'
df_complications = read_file_df(path, schema_complications)
# Replace the text column 'fecha_complicacion' with its parsed date (dd/mm/YYYY input)
df_complications = df_complications.withColumn('fecha_complicacion', parser_to_date(df_complications['fecha_complicacion']))
# Mark the frame for caching; generate_analysis filters it once per analysed month.
# count() is the action that actually materialises it and reports the row count.
df_complications.cache()
df_complications.count()

342515

In [8]:
# Label a row '1_C' when the value is present and '0_C' when it is NULL.
# Identity comparison is the correct way to test for None.
udf_get_label = udf(lambda x: '1_C' if x is not None else '0_C', StringType())

In [9]:
# Spot check: most recent IAM complication stored for one known affiliate
from pyspark.sql.functions import col, desc
(df_complications
    .where((col('id_afiliado') == 1647197) & (col('complicacion') == 'IAM'))
    .sort(col('fecha_complicacion').desc())
    .show(1))

+-----------+------------------+-----------------+------------+
|id_afiliado|fecha_complicacion|tipo_complicacion|complicacion|
+-----------+------------------+-----------------+------------+
|    1647197|        2015-05-31|  Desenlace final|         IAM|
+-----------+------------------+-----------------+------------+
only showing top 1 row



In [10]:
# Map the class label '1_C' to 1 and any other value (including NULL) to 0.
udf_binarize_label = udf(lambda x: int(x == '1_C'), IntegerType())

In [11]:
def generate_analysis(columns, limit, number_months_to_analyze, 
                      columns_merge, debug, order_desc, complicacion, tipo_complicacion, image_path, image_file_name,
                      plot_title):
    """Compare the top-scored affiliates of each month with later complications.

    For every month-end date starting at 2015-01-31 the function takes, from
    the module-level ``df``, the ``limit`` rows of that period with the highest
    ``order_desc`` value. These are left-joined with the matching rows of the
    module-level ``df_complications`` dated strictly after the period date and
    strictly before the period date plus 13 months. Each row is labelled
    '1_C' when a complication was found and '0_C' otherwise.

    Parameters:
        columns: column names projected from ``df``.
        limit: maximum number of rows kept per month; also the denominator
            used by the plot's percentage annotations.
        number_months_to_analyze: number of consecutive month ends to process.
        columns_merge: columns shown for the joined frame in debug mode.
        debug: 'Y' prints intermediate frames; any other value disables that.
        order_desc: column used to rank the rows, in descending order.
        complicacion: value matched against the 'complicacion' column.
        tipo_complicacion: value matched against 'tipo_complicacion'.
        image_path, image_file_name: accepted for compatibility; not used.
        plot_title: title forwarded to get_plot_performance_prediction.

    Returns:
        A ``(fig, results_pandas)`` tuple: the stacked bar figure with one bar
        per analysed month, and a pandas DataFrame with the columns
        'prediction_b' and 'label_b' for all analysed months.
    """
    import pandas as pd
    from dateutil.relativedelta import relativedelta
    from pyspark.sql.functions import col, desc
    import plotly.plotly as py
    # NOTE(review): API credentials are hard-coded in source; they should be
    # rotated and supplied from outside the notebook.
    py.plotly.tools.set_credentials_file(username='jreyro', api_key='vizy9pl3ds')

    verbose = (debug == 'Y')

    # A string start date avoids pd.datetime, which newer pandas no longer provides.
    months = pd.date_range(start='2015-01-01', periods=number_months_to_analyze, freq='M')

    if verbose:
        print(months)

    df_final_scores_pandas = pd.DataFrame(columns=['date', '0_c', '1_c'])
    monthly_results = []

    # Iterate number of months
    for month in months:

        month_parsed = month.strftime('%Y-%m-%d')
        final_date = month.date() + relativedelta(months=13)
        print('month_parsed: ',month_parsed)
        print('final_date: ',final_date)
        df_iam = (df.select(columns)
                    .filter(col('period') == month_parsed)
                    .orderBy(desc(order_desc))
                    .limit(limit)
                 )

        # Testing purposes
        if verbose:
            df_iam.show(5)

        # Complications per affiliate inside the observation window
        df_complications_aggr = (df_complications
                                 .filter(col('tipo_complicacion') == tipo_complicacion)
                                 .filter(col('complicacion') == complicacion)
                                 .filter(col('fecha_complicacion') > month_parsed)
                                 .filter(col('fecha_complicacion') < final_date)
                                 .groupBy('id_afiliado')
                                 .count())
        if verbose:
            df_complications_aggr.show(5)

        # Left join keeps every scored affiliate; 'count' is NULL when no
        # complication was found for that affiliate.
        cond = [df_iam.id_afiliado == df_complications_aggr.id_afiliado]
        merge_predict_label = (df_iam.join(df_complications_aggr, cond, 'left')
                               .drop(df_complications_aggr.id_afiliado))
        if verbose:
            merge_predict_label.select(columns_merge).show(5)

        # Add label column
        merge_predict_label = merge_predict_label.withColumn('label', udf_get_label('count'))
        if verbose:
            merge_predict_label.show(5)

        # Transform to pandas
        predict_label_pandas = merge_predict_label.groupBy('label').count().toPandas()
        if verbose:
            print(predict_label_pandas)

        # Look the counts up by label: the grouped frame has no guaranteed row
        # order, and a class with no rows is simply absent from it.
        counts_by_label = dict(zip(predict_label_pandas['label'],
                                   predict_label_pandas['count']))
        zero_datum = int(counts_by_label.get('0_C', 0))
        one_datum = int(counts_by_label.get('1_C', 0))
        row = [month_parsed, zero_datum, one_datum]
        if verbose:
            print(row)
        df_final_scores_pandas.loc[len(df_final_scores_pandas)] = row
        if verbose:
            print(df_final_scores_pandas)

        # Binarize target
        merge_predict_label = merge_predict_label.withColumn('label_b', udf_binarize_label('label'))
        merge_predict_label = merge_predict_label.withColumn('prediction_b', udf_binarize_label('predict_pred_aaat_iam'))
        results = merge_predict_label.select('prediction_b', 'label_b')
        month_results_pandas = results.toPandas()
        if verbose:
            print(month_results_pandas)
        monthly_results.append(month_results_pandas)

    # Plot and return only after every requested month has been processed
    # (returning inside the loop stopped the analysis after the first month).
    fig = get_plot_performance_prediction(df_final_scores_pandas, limit, plot_title)
    if verbose:
        print(fig)

    if monthly_results:
        results_pandas = pd.concat(monthly_results, ignore_index=True)
    else:
        results_pandas = pd.DataFrame(columns=['prediction_b', 'label_b'])
    return fig, results_pandas

In [36]:
def get_plot_performance_prediction(df_final_scores_pandas, limit, plot_title):
    """Build a stacked bar figure of '0_C' / '1_C' counts per date.

    Parameters:
        df_final_scores_pandas: pandas DataFrame with the columns 'date',
            '0_c' and '1_c' (one row per analysed month).
        limit: denominator used for the percentage annotations.
        plot_title: title of the figure.

    Returns:
        A plotly ``Figure`` with two stacked bar traces and one percentage
        annotation per date.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having bound `go` as a global beforehand.
    import plotly.graph_objs as go

    print('Getting plot')
    dates = list(df_final_scores_pandas['date'])
    negatives = list(df_final_scores_pandas['0_c'])
    positives = list(df_final_scores_pandas['1_c'])

    data = [
        go.Bar(x=dates, y=negatives, name='0_C'),
        go.Bar(x=dates, y=positives, name='1_C'),
    ]

    # Annotations with the percentage of positives, truncated to an integer.
    # NOTE(review): the percentage is relative to `limit`, not to the number of
    # rows actually counted, and the annotation is placed at y = percentage
    # rather than at the bar height - confirm both are intended.
    percentages = [int((count / limit) * 100) for count in positives]

    annotations = [
        dict(
            x=date_value,
            y=percentage,
            text=str(percentage) + '%',
            xanchor='center',
            yanchor='bottom',
            showarrow=False,
        )
        for date_value, percentage in zip(dates, percentages)
    ]

    layout = go.Layout(annotations=annotations,
                       barmode='stack',
                       title=plot_title)

    return go.Figure(data=data, layout=layout)

In [29]:
def get_plot_roc(false_positive_rate, true_positive_rate, roc_auc=None):
    """Build a plotly figure of a ROC curve.

    Parameters:
        false_positive_rate: x values of the curve.
        true_positive_rate: y values of the curve.
        roc_auc: area under the curve shown in the annotation. When omitted it
            is computed from the two rate arrays with ``sklearn.metrics.auc``,
            so the function no longer relies on a global ``roc_auc``.

    Returns:
        A plotly ``Figure`` with the curve, a dotted diagonal from (0, 0) to
        (1, 1) and a text annotation with the area under the curve.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having bound `go` as a global beforehand.
    import plotly.graph_objs as go

    if roc_auc is None:
        from sklearn.metrics import auc
        roc_auc = auc(false_positive_rate, true_positive_rate)

    # Create a trace
    trace = go.Scatter(
               x=false_positive_rate,
               y=true_positive_rate
            )

    data = [trace]

    layout = go.Layout(
                title='Receiver Operating Characteristic',
                # Axis titles follow the plotted data: x is the false positive
                # rate and y the true positive rate (they were swapped before).
                xaxis=dict(
                           title='False Positive Rate'
                          ),
                yaxis=dict(
                           title='True Positive Rate'
                          ),
                annotations=[
                             dict(
                                  x=0.9,
                                  y=0.2,
                                  xref='x',
                                  yref='y',
                                  text='Roc curve (area = %0.2f)' % roc_auc,
                                  showarrow=False
                                 )
                            ],
                # Dotted diagonal drawn as a reference line
                shapes=[
                        dict(
                             type='line',
                             x0=0,
                             y0=0,
                             x1=1,
                             y1=1,
                             line=dict(
                                       color='rgb(255,140,0)',
                                       dash='dot'
                                      )
                            )
                       ]
             )

    fig = go.Figure(data=data, layout=layout)
    return fig

In [30]:
import time
# `py` has to be bound before the credentials call below; without this import
# the cell raises NameError unless another cell imported it first.
import plotly.plotly as py
# NOTE(review): API credentials are hard-coded in source; they should be
# rotated and supplied from outside the notebook.
py.plotly.tools.set_credentials_file(username='jreyro', api_key='vizy9pl3ds')  
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv'

# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
# Rows are ranked by this column (descending) and the first `limit` are kept
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
limit = 1000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1

start_time = time.time()
print('File: ', file)
#Read file
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())

# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
plot_title = 'Efectividad scores IAM - 1000 personas'
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 1.563199758529663 ---


## Primer modelo IAM Rendimiento 1000 personas

[[ ir al índice ]](#Tabla-de-contenidos)

In [33]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's offline notebook mode before the first iplot call
init_notebook_mode(connected=True)
# Show the stacked bar figure currently bound to `fig`
iplot(fig, filename='stacked-bar')

In [34]:
# Analysis run: prediction file iam_1, 7000 highest-scored rows.
import time
import plotly.plotly as py
import plotly.graph_objs as go
# NOTE(review): API credentials are hard-coded in source; they should be
# rotated and supplied from outside the notebook.
py.plotly.tools.set_credentials_file(username='jreyro', api_key='vizy9pl3ds')  
from sklearn.metrics import roc_curve, auc

# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv'

# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
# Rows are ranked by this column (descending) and the first `limit` are kept
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
limit = 7000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1    
  
start_time = time.time()
print('File: ', file)
#Read file
# `df` is a notebook global that generate_analysis reads directly
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())
    
# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
plot_title = 'Efectividad scores IAM - 7000 personas'
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
# NOTE(review): roc_curve receives the binarized predictions, not a continuous
# score - confirm this is intended.
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 1.650287389755249 ---


## Primer modelo IAM Rendimiento 7000 personas

[[ ir al índice ]](#Tabla-de-contenidos)

In [35]:
# Show the stacked bar figure currently bound to `fig`
iplot(fig, filename='stacked-bar')

In [37]:
import time
import plotly.plotly as py
import plotly.graph_objs as go
# NOTE(review): API credentials are hard-coded in source; they should be
# rotated and supplied from outside the notebook.
py.plotly.tools.set_credentials_file(username='jreyro', api_key='vizy9pl3ds')  
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv'

# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
# Larger than the 70797 rows the file reports, so the limit cuts nothing off
limit = 100000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1

start_time = time.time()
print('File: ', file)
#Read file
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())

# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
# Derived from `limit` so the title cannot drift from the value actually used
# (it previously read "7000 personas" while limit was 100000).
plot_title = 'Efectividad scores IAM - %d personas' % limit
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1037_iam_1.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 2.432133913040161 ---


## Primer modelo IAM ROC

[[ ir al índice ]](#Tabla-de-contenidos)

In [38]:
# Build the ROC figure from the rates computed in the previous cell and show it
fig = get_plot_roc(false_positive_rate, true_positive_rate)
iplot(fig, filename='basic_line')

In [40]:
import time
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.metrics import roc_curve, auc

# Analysis run: prediction file iam_2, 1000 highest-scored rows.
# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv'


# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
# Rows are ranked by this column (descending) and the first `limit` are kept
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
limit = 1000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1    
  
start_time = time.time()
print('File: ', file)
#Read file
# `df` is a notebook global that generate_analysis reads directly
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())
    
# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
plot_title = 'Efectividad scores IAM - 1000 personas'
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
# NOTE(review): roc_curve receives the binarized predictions, not a continuous
# score - confirm this is intended.
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 1.3593688011169434 ---


## Segundo modelo IAM Rendimiento 1000 personas

[[ ir al índice ]](#Tabla-de-contenidos)

In [41]:
# Show the stacked bar figure currently bound to `fig`
iplot(fig, filename='stacked-bar')

In [42]:

import time
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.metrics import roc_curve, auc

# Analysis run: prediction file iam_2, 7000 highest-scored rows.
# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv'

# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
# Rows are ranked by this column (descending) and the first `limit` are kept
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
limit = 7000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1    
  
start_time = time.time()
print('File: ', file)
#Read file
# `df` is a notebook global that generate_analysis reads directly
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())
    
# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
plot_title = 'Efectividad scores IAM - 7000 personas'
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
# NOTE(review): roc_curve receives the binarized predictions, not a continuous
# score - confirm this is intended.
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 1.367189884185791 ---


## Segundo modelo IAM Rendimiento 7000 personas

[[ ir al índice ]](#Tabla-de-contenidos)

In [43]:
# Show the stacked bar figure currently bound to `fig`
iplot(fig, filename='stacked-bar')

In [45]:

import time
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.metrics import roc_curve, auc

# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv'

# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
# Larger than the 70797 rows the file reports, so the limit cuts nothing off
limit = 100000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1

start_time = time.time()
print('File: ', file)
#Read file
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())

# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
# Derived from `limit` so the title cannot drift from the value actually used
# (it previously read "1000 personas" while limit was 100000).
plot_title = 'Efectividad scores IAM - %d personas' % limit
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1344_iam_2.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 2.186326265335083 ---


## Segundo modelo IAM ROC

[[ ir al índice ]](#Tabla-de-contenidos)

In [46]:
# Build the ROC figure from the rates computed in the previous cell and show it
fig = get_plot_roc(false_positive_rate, true_positive_rate)
iplot(fig, filename='basic_line')

In [47]:
import time
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.metrics import roc_curve, auc

# Analysis run: prediction file iam_3, 1000 highest-scored rows.
# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv'

# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
# Rows are ranked by this column (descending) and the first `limit` are kept
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
limit = 1000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1    
  
start_time = time.time()
print('File: ', file)
#Read file
# `df` is a notebook global that generate_analysis reads directly
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())
    
# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
plot_title = 'Efectividad scores IAM - 1000 personas'
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
# NOTE(review): roc_curve receives the binarized predictions, not a continuous
# score - confirm this is intended.
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 1.7165801525115967 ---


## Tercer modelo IAM Rendimiento 1000 personas

[[ ir al índice ]](#Tabla-de-contenidos)

In [48]:
# Show the stacked bar figure currently bound to `fig`
iplot(fig, filename='stacked-bar')

In [49]:
import time
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.metrics import roc_curve, auc

# Analysis run: prediction file iam_3, 7000 highest-scored rows.
# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv'

# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
# Rows are ranked by this column (descending) and the first `limit` are kept
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
limit = 7000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1    
  
start_time = time.time()
print('File: ', file)
#Read file
# `df` is a notebook global that generate_analysis reads directly
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())
    
# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
plot_title = 'Efectividad scores IAM - 7000 personas'
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
# NOTE(review): roc_curve receives the binarized predictions, not a continuous
# score - confirm this is intended.
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 1.4150738716125488 ---


## Tercer modelo IAM Rendimiento 7000 personas

[[ ir al índice ]](#Tabla-de-contenidos)

In [50]:
# Show the stacked bar figure currently bound to `fig`
iplot(fig, filename='stacked-bar')

In [51]:
import time
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.metrics import roc_curve, auc

# On Linux
file = '/home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv'

# On Windows
#file = 'C:/Users/Javier/Documents/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv'

# Columns to project in select clause - IAM
columns = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam']
# Columns for merge
columns_merge = ['id_afiliado', 'period', 'predict_pred_aaat_iam', 'x0_c_pred_aaat_iam', 'x1_c_pred_aaat_iam','count']
order_desc = 'x1_c_pred_aaat_iam'
complicacion = 'IAM'
tipo_complicacion = 'Desenlace final'
# Larger than the 70797 rows the file reports, so the limit cuts nothing off
limit = 100000
number_months_to_analyze = 1
image_path = '/home/jasam/motor/sura_acreditacion_scores/images/'
index = 1

start_time = time.time()
print('File: ', file)
#Read file
df = read_file_df(file, schema)
df.cache()
print('Rows: ', df.count())

# Do data analysis!
image_file_name = 'IAM_bar_plot_' + str(index) + '.png'
# Derived from `limit` so the title cannot drift from the value actually used
# (it previously read "1000 personas" while limit was 100000).
plot_title = 'Efectividad scores IAM - %d personas' % limit
# 'N' disables the debug output of generate_analysis
fig, results_pandas = generate_analysis(columns, limit, 
                  number_months_to_analyze, columns_merge, 
                  'N', order_desc, complicacion, tipo_complicacion, image_path, 
                  image_file_name, plot_title)
# Roc curve
actual = results_pandas['label_b'].values
predictions = results_pandas['prediction_b'].values
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predictions)
roc_auc = auc(false_positive_rate, true_positive_rate)

print("--- Seconds lapsed: %s ---" % (time.time() - start_time))

File:  /home/jasam/motor/sura_acreditacion_scores/fuente_externa_csv/predicciones01Nov16_1420_iam_3.csv
Rows:  70797
month_parsed:  2015-01-31
final_date:  2016-02-29
Getting plot
--- Seconds lapsed: 2.1461970806121826 ---


## Tercer modelo IAM ROC

[[ ir al índice ]](#Tabla-de-contenidos)

In [52]:
# Build the ROC figure from the rates computed in the previous cell and show it
fig = get_plot_roc(false_positive_rate, true_positive_rate)
iplot(fig, filename='basic_line')