In [None]:
# Render a centered logo image as the notebook's banner.
displayHTML('<div style="text-align:center"><img src ="https://github.com/romulomadu/PEDS/blob/master/algebra/tarefas/logos.png?raw=true" /></div>')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
import scipy
import pandas as pd
import math
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from collections import OrderedDict

In [None]:
# Where the raw churn dataset lives and which reader format to use.
file_location = "/FileStore/tables/Churn_Modelling.csv"
file_type = "csv"

file_prods = ""

# Reader settings; they only matter for CSV input and are ignored otherwise.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Load the file into a Spark DataFrame (header row, inferred column types).
df_ = (
    spark.read.format(file_type)
    .option("sep", delimiter)
    .option("header", first_row_is_header)
    .option("inferSchema", infer_schema)
    .load(file_location)
)

In [None]:
# Build a synthetic product-ownership table: one boolean column per product,
# derived from simple rules on the customer's salary, age, score and gender.
prod_df = df_.toPandas()

# Masks shared by several product rules, computed once.
salary_over_40k = prod_df.EstimatedSalary > 40000
salary_over_30k = prod_df.EstimatedSalary > 30000
older_than_30 = prod_df.Age > 30

prod_df['Poup'] = salary_over_40k
prod_df['Prev'] = salary_over_40k & older_than_30
prod_df['Stocks'] = salary_over_40k & older_than_30 & (prod_df.CreditScore > 700)
prod_df['PIC'] = ((prod_df.Age < 35) | (prod_df.Age > 60)) & (prod_df.EstimatedSalary < 40000)
prod_df['CDB'] = prod_df.Age > 50
prod_df['TD'] = salary_over_30k
prod_df['Micro'] = (prod_df.EstimatedSalary < 30000) & (prod_df.Gender == 'Female')
prod_df['CrCard'] = salary_over_30k & (prod_df.HasCrCard == 0)
prod_df['Univ'] = (prod_df.Age <= 23)
prods = ['Poup', 'Prev', 'Stocks', 'PIC', 'CDB', 'TD', 'Micro', 'CrCard', 'Univ']

# Keep only the identifier, the balance and the product flags. The explicit
# .copy() makes this an independent frame, so the column assignments below do
# not trigger pandas' SettingWithCopyWarning.
prod_df = prod_df[['CustomerId', 'Balance'] + prods].copy()

# balProd = (largest product count of any customer) / (this customer's count).
# Customers with zero products divide by zero and get inf, which is reset to 0.
products_per_customer = prod_df[prods].sum(axis=1)
prod_df['balProd'] = products_per_customer.max() / products_per_customer
prod_df.loc[prod_df['balProd'] == np.inf, 'balProd'] = 0

# Snapshot taken before the randomisation step mutates the flags.
prod_df_old = prod_df.copy()

In [None]:
# A little randomness
def randomness(col, bias=0.5, df=None):
  """Randomly perturb a boolean product column.

  One noise value is drawn per row from {0.0, 0.1, ..., 1.0} and shifted down
  by ``bias``; a row ends up True when ``flag + noise >= 1``.

  Args:
    col: name of the column to perturb.
    bias: amount subtracted from every noise draw (larger bias -> fewer Trues).
    df: frame holding ``col``; defaults to the notebook-level ``prod_df``.

  Returns:
    A single-column boolean DataFrame (column ``col``) sharing ``df``'s index.
  """
  frame = prod_df if df is None else df
  # Size and index the noise from the frame itself: a fixed-length vector
  # would leave every row past its end as NaN (hence False) and would not
  # line up with a non-default index.
  noise = pd.Series(
      [(random.randint(0, 10) / 10.) - bias for _ in range(len(frame))],
      index=frame.index,
  )
  return frame[[col]].add(noise, axis=0) >= 1
  
# Product-specific noise first: (column, bias) pairs, applied in this order.
# NOTE(review): no RNG seed is set in the visible cells, so these flags change
# between runs -- confirm whether that is intended.
for col, col_bias in (('PIC', -0.3), ('CDB', 0.3), ('TD', 0.6)):
  prod_df[col] = randomness(col, col_bias)

# Then a default-bias pass over every product (including the three above).
for prod in prods:
  prod_df[prod] = randomness(prod)

# A customer with no balance cannot hold any product.
has_balance = prod_df.Balance > 0
for col in prods:
  prod_df[col] = prod_df[col] & has_balance

In [None]:
# Reshape to long format: one row per (customer, product) pair, with the
# product name in 'variable' and the ownership flag in 'value'.
melt_prod_df = pd.melt(prod_df, id_vars=['CustomerId', 'Balance', 'balProd'])

# Keep only the products each customer actually owns; the flag column is then
# constant and can be dropped.
owned_mask = melt_prod_df['value']
melt_prod_df = melt_prod_df[owned_mask].drop('value', axis=1)

In [None]:
# Integer-typed columns of the raw frame, pulled into pandas.
numeric_features = [name for name, dtype in df_.dtypes if dtype == 'int']
numeric_data = df_.select(numeric_features).toPandas()
n = numeric_data.shape[1]

# Modelling frame: the predictor columns plus the 'Exited' target.
df = df_.select(
    'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    'Exited',
)
cols = df.columns

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
# Categorical predictors: each one is string-indexed and then one-hot encoded.
# NOTE(review): OneHotEncoderEstimator is the Spark 2.x name of this encoder;
# confirm the cluster's Spark version before upgrading.
categoricalColumns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
stages = []
for categoricalCol in categoricalColumns:
    indexedCol = categoricalCol + 'Index'
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=indexedCol)
    encoder = OneHotEncoderEstimator(inputCols=[indexedCol],
                                     outputCols=[categoricalCol + "classVec"])
    stages.extend([stringIndexer, encoder])

# The churn flag becomes the numeric 'label' column.
label_stringIdx = StringIndexer(inputCol='Exited', outputCol='label')
stages.append(label_stringIdx)

# Numeric predictors go into the feature vector unchanged.
numericCols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# Final stage: concatenate encoded categoricals and numerics into 'features'.
assemblerInputs = ['{}classVec'.format(c) for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [None]:
# Overrides the earlier `cols = df.columns`: same predictor columns, but
# without the 'Exited' target, which is carried as 'label' after the pipeline.
cols = ['CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [None]:
from pyspark.ml import Pipeline
# Fit the preprocessing stages on the modelling frame, then keep only the
# label, the assembled feature vector and the original predictor columns.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
selectedCols = ['label', 'features'] + cols
df = pipelineModel.transform(df).select(selectedCols)

In [None]:
# 70/30 train/test split with a fixed seed.
train, test = df.randomSplit([0.7, 0.3], seed = 2018)

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Baseline churn classifier: logistic regression capped at 10 iterations,
# fitted on the training split.
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)

In [None]:
# Plot the logistic-regression coefficients sorted in ascending order.
import matplotlib.pyplot as plt
plt.clf()  # start from an empty current figure
import numpy as np
beta = np.sort(lrModel.coefficients)
plt.plot(beta)
plt.ylabel('Beta Coefficients')
# The figure is not shown here: the display call below is commented out.
#display(plt.show())

In [None]:
# ROC curve of the logistic regression on the training data.
plt.clf()
trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'], roc['TPR'])
# FPR is plotted on the x axis and TPR on the y axis, so the labels must
# follow that order (they were previously swapped).
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
#display(plt.show())
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

In [None]:
# Score the held-out split with the logistic regression. `predictions` is
# reassigned by each of the following model cells.
predictions = lrModel.transform(test)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
# Decision tree limited to depth 3, fitted on the training split and used to
# score the test split (overwrites the previous `predictions`).
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train)
predictions = dtModel.transform(test)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
# Random forest with default hyper-parameters. `rfModel` is the model used
# later to score the customer selected in the widget cell.
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

In [None]:
from pyspark.ml.classification import GBTClassifier
# Gradient-boosted trees capped at 10 iterations. Unlike the other models, no
# feature/label column names are passed here.
# NOTE(review): presumably the defaults resolve to 'features' / 'label' --
# confirm against the GBTClassifier documentation.
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)
predictions = gbtModel.transform(test)

In [None]:
# Widget through which the customer to analyse is chosen.
dbutils.widgets.text(name='CustId_input', defaultValue='15634602', label='CustId')
custid_input = int(getArgument("CustId_input"))
plt.clf()

# Isolate the customer's row and run it through the fitted preprocessing
# pipeline. 'Exited' is kept because the pipeline's label indexer reads it.
input_df = df_.filter('CustomerId == {}'.format(custid_input))
input_df = input_df.drop('CustomerId', 'RowNumber', 'Surname')
input_df = pipelineModel.transform(input_df)

# NOTE(review): this cell scores with rfModel, while the histogram cell
# further down uses gbtModel for the population -- confirm this is intended.
pred_input = rfModel.transform(input_df)
prob_pdf = pred_input.select('probability').toPandas()
if prob_pdf.empty:
  # Without this guard an unknown id fails below with an opaque `KeyError: 0`.
  raise ValueError('CustomerId {} was not found in the dataset'.format(custid_input))

# Probability of the positive class (index 1), as a percentage.
prob_exit_input = round((prob_pdf['probability'].loc[0][1])*100, 2)

# One retention level per 20 percentage points of churn probability.
strategy = int(prob_exit_input/20) + 1
if strategy == 1:
  message = 'Não oferecer taxas diferenciadas'
else:
  message = 'Ofereça taxas diferenciadas de nível {}'.format(strategy)

displayHTML("""<center><h1 style="font-family:verdana;">O cliente tem {}% de chances de sair do Banco</h1>
            <h3>{}</h3>
            </center>
            """.format(prob_exit_input, message))


In [None]:
# Histogram of predicted churn probabilities over the training set, with a
# red marker above the bin that contains the selected customer.
custid_input = int(getArgument("CustId_input"))
pred_tot = gbtModel.transform(train)
probs = pred_tot.select('probability').toPandas()['probability']
# Keep only the positive-class probability (index 1) of each vector.
probs = [prob_vector[1] for prob_vector in probs]

plt.clf()
fig, ax = plt.subplots()
plt.title('Probabilidade de saida dos clientes de acordo com o modelo')
hist_data = ax.hist(probs, bins=8)
y_hist = hist_data[0]       # bin counts
bin_edges = hist_data[1]    # one more edge than there are bins
x_hist = (bin_edges > prob_exit_input/100)
plt.ylim([0, y_hist.max()*1.15])

# The customer's bin is the one whose left edge is the last edge <= the
# probability. Clamp it to a valid bin: a probability below the first edge
# (no such edge) or at/above the last edge (index == number of bins) would
# otherwise raise IndexError.
bin_idx = int(np.clip(np.count_nonzero(~x_hist) - 1, 0, len(y_hist) - 1))
height = y_hist[bin_idx]
ax.scatter(round(prob_exit_input/100, 2), height*1.1, c='r', s=80)
display(plt.show())

# Recommender System
### Collaborative Filtering

In [None]:
# Hold out 20% of the (customer, product) interactions, with a fixed seed.
interactions_train_df, interactions_test_df = train_test_split(melt_prod_df, 
                                   test_size=0.20,
                                   random_state=42)


In [None]:
# Full interaction table, plus CustomerId-indexed views of the full, train and
# test sets for per-customer lookups with .loc.
interactions_full_df = melt_prod_df
interactions_full_indexed_df = melt_prod_df.set_index('CustomerId')
interactions_train_indexed_df = interactions_train_df.set_index('CustomerId')
interactions_test_indexed_df = interactions_test_df.set_index('CustomerId')

In [None]:
def get_items_interacted(person_id, interactions_df):
    """Return the set of items a customer has already interacted with.

    Args:
        person_id: index label of the customer to look up.
        interactions_df: interactions indexed by customer id, with the item
            name in the 'variable' column.

    Returns:
        A set with the customer's 'variable' values.

    Raises:
        KeyError: if ``person_id`` is not present in the index.
    """
    interacted_items = interactions_df.loc[person_id]['variable']
    # .loc yields a Series when the customer has several rows and a bare
    # scalar when there is exactly one; normalise both to a set.
    if isinstance(interacted_items, pd.Series):
        return set(interacted_items)
    return {interacted_items}

In [None]:
# Pivot the interactions into a dense users x items table (customers in rows,
# products in columns, balProd as the value, 0 where there is no interaction).
users_items_pivot_matrix_df = interactions_full_df.pivot(index='CustomerId',
                                                          columns='variable',
                                                          values='balProd').fillna(0)

# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# .values returns the same ndarray on both old and new pandas versions.
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_ids = list(users_items_pivot_matrix_df.index)

# Number of latent factors kept by the truncated SVD.
NUMBER_OF_FACTORS_MF = 5

# Factorise the user-item matrix and rebuild its low-rank approximation
# U * diag(sigma) * Vt, which serves as the predicted score of every
# (user, item) pair.
U, sigma, Vt = svds(users_items_pivot_matrix, k=NUMBER_OF_FACTORS_MF)
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

# Back to pandas, transposed so that each column is a user and each row an item.
cf_preds_df = pd.DataFrame(all_user_predicted_ratings,
                           columns=users_items_pivot_matrix_df.columns,
                           index=users_ids).transpose()

In [None]:
class CFRecommender:
    """Recommend items from a table of predicted user/item scores.

    ``cf_predictions_df`` holds one column per user id and one row per item;
    the item identifier must surface as a 'variable' column when the index is
    reset. ``items_df`` is optional item metadata (with a 'variable' column)
    and is only needed in verbose mode.
    """

    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the human-readable name of this recommender."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=10, verbose=False):
        """Return the ``topn`` highest-scored items for ``user_id``.

        Args:
            user_id: column label of the user in ``cf_predictions_df``.
            items_to_ignore: items to exclude (typically those the user
                already has); ``None`` means nothing is excluded.
            topn: maximum number of rows returned.
            verbose: if True, join the result with ``items_df``.

        Returns:
            DataFrame with 'variable' and 'recStrength' columns, best first.

        Raises:
            KeyError: if ``user_id`` is not a column of the predictions.
            Exception: if ``verbose`` is set but no ``items_df`` was given.
        """
        # None instead of a mutable [] default, which would be shared by
        # every call.
        if items_to_ignore is None:
            items_to_ignore = []

        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Keep the highest-scored items that are not in the ignore list.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['variable'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left',
                                                          left_on = 'variable',
                                                          right_on = 'variable')[['recStrength', 'variable']]

        return recommendations_df
    
# Recommender backed by the SVD-reconstructed scores; no item metadata is
# supplied, so verbose mode is unavailable on this instance.
cf_recommender_model = CFRecommender(cf_preds_df, None)

In [None]:
custid_input = int(getArgument("CustId_input"))

# Column labels shown to the user. Defined before any try block because both
# the normal path and the fallback path need them.
renamer = {'variable': 'Produto', 'recPerc': '% de chance'}

# Products the customer already holds, looked up once. A customer without any
# product is absent from the interactions index, hence the KeyError.
try:
  owned_products = get_items_interacted(custid_input, interactions_full_indexed_df)
except KeyError:
  owned_products = None

rec_html = None
if owned_products is not None:
  try:
    person_recs_df = cf_recommender_model.recommend_items(custid_input, owned_products)
    # Only positively scored products are offered; their scores are turned
    # into percentages of the positive total. .assign builds a new frame
    # instead of writing into a filtered slice.
    positive_recs = person_recs_df[person_recs_df['recStrength'] > 0]
    rec_strengths = positive_recs['recStrength']
    person_recs_df = positive_recs.assign(recPerc=rec_strengths*100/rec_strengths.sum())
    person_recs_df = person_recs_df.dropna(axis=0)
    person_recs_df = person_recs_df.drop('recStrength', axis=1).rename(index=str, columns=renamer)
    rec_html = person_recs_df.to_html(index=False)
  except Exception:
    # Best effort on purpose: any failure falls back to the default offer.
    rec_html = None

if rec_html is None:
  # Default offer when no personalised recommendation could be produced.
  ord_dict = OrderedDict([('variable', ['Poup', 'CDB']), ('recPerc', [50, 50])])
  rec_html = pd.DataFrame.from_dict(ord_dict).rename(index=str, columns=renamer).to_html(index=False)

if owned_products is None:
  products_html = 'nao possui produtos'
else:
  products_html = 'possui os produtos {}'.format(', '.join(owned_products))

displayHTML("""<center style="font-family:verdana;">O cliente {}<br>
Ofereça os produtos abaixo:<br> {}
</center>""".format(products_html, rec_html))

In [None]:
# Raw recommendation table (unnormalised recStrength) for the selected
# customer, excluding the products the customer already holds.
# NOTE(review): unlike the cell above, nothing here catches the KeyError
# raised for a customer without products -- confirm whether that matters.
cf_recommender_model.recommend_items(custid_input, 
                                                        get_items_interacted(custid_input, interactions_full_indexed_df))