In [None]:
import os
import re
from pprint import pprint

import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import nltk
import nltk.corpus
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.utils import simple_preprocess
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

In [None]:
# Move to the project root so the relative data paths used below resolve.
# Set the TELCO_CHURN_DIR environment variable to run the notebook on another
# machine; the original local checkout remains the fallback.
PROJECT_DIR = os.environ.get("TELCO_CHURN_DIR", "/home/ikonkobo/Desktop/Self_Learning/telco_churn/")
os.chdir(PROJECT_DIR)

# Data reading
The raw data are stored in the storage system S3 through SSP cloud.

In [None]:
# Load the raw customer table from the SAS file (path relative to the working directory).
raw_sas_path = "./data/raw/commsdata.sas7bdat"
comms_df = pd.read_sas(raw_sas_path)

In [None]:
# Peek at the verbatim stored under row label 0.
comms_df.loc[0, "verbatims"]

In [None]:
# Work on a separate one-column frame so the text processing does not touch comms_df.
df_ = comms_df[["verbatims"]].copy()

In [None]:
# Normalise the raw verbatims: decode the byte strings as UTF-8, strip basic
# punctuation (, . ? !) and lower-case the text.
# The pattern is a raw string because "\," is an invalid escape sequence in a
# regular string literal; the vectorised .str accessors propagate missing
# values instead of raising on them.
df_["verbatims_process"] = (
    df_["verbatims"]
    .str.decode("utf-8")
    .str.replace(r"[,.?!]", "", regex=True)
    .str.lower()
)

In [None]:
# Show only the first rows: enough to check the cleaning without dumping the frame.
df_.head()

In [None]:
# Exploratory cell: builds a Lancaster stemmer only to display its repr;
# the instance is not bound to a name and is not reused below.
LancasterStemmer()

In [None]:
# Preview Lancaster stemming of the cleaned verbatims.
# LancasterStemmer.stem treats its argument as a single word, so each
# whitespace-separated token is stemmed on its own and the stems are joined
# back into a sentence. One stemmer instance is shared by all rows.
lancaster_stemmer = LancasterStemmer()
df_.verbatims_process.apply(
    lambda sentence: " ".join(lancaster_stemmer.stem(token) for token in sentence.split())
)

In [None]:
def sentence_tokenization(sentences):
    """Lazily tokenize sentences with gensim's ``simple_preprocess``.

    Each item of ``sentences`` is cast to ``str`` and passed to
    ``simple_preprocess`` with ``deacc=True``; the result for each sentence is
    yielded in input order.
    """
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)

In [None]:
# Create Dictionary
# Tokenize first, then stem token by token: LancasterStemmer.stem expects a
# single word, so stemming a whole sentence before tokenizing (as was done
# previously) leaves almost every word unstemmed.
stemmer = LancasterStemmer()
data_words = [
    [stemmer.stem(token) for token in tokens]
    for tokens in sentence_tokenization(df_.verbatims_process.values.tolist())
]
id2word = corpora.Dictionary(data_words)

In [None]:
# Share of documents containing each dictionary word; keep the words present
# in more than 2 % of the documents and display them, most frequent first.
doc_freq = pd.DataFrame(list(id2word.dfs.items()), columns=["word_id", "p_occur"])
doc_freq = doc_freq.assign(
    word=doc_freq["word_id"].apply(lambda token_id: id2word[token_id]),
    p_occur=doc_freq["p_occur"] / id2word.num_docs,
)
df_occur = doc_freq[doc_freq["p_occur"] > 0.02]
df_occur.sort_values(by='p_occur', ascending=False)

In [None]:
# Stop-word list: NLTK's English stop words followed by the frequent corpus words kept in df_occur.
stop_words = stopwords.words('english') + df_occur.word.to_list()

In [None]:
def remove_stopwords(texts):
    """Re-tokenize every document and drop the tokens found in ``stop_words``.

    Each document is converted with ``str`` and run through
    ``simple_preprocess``; the function returns one list of kept tokens per
    document, in input order.
    """
    cleaned_docs = []
    for doc in texts:
        kept_tokens = []
        for word in simple_preprocess(str(doc)):
            if word in stop_words:
                continue
            kept_tokens.append(word)
        cleaned_docs.append(kept_tokens)
    return cleaned_docs

In [None]:
# Remove stop words: data_words is overwritten with the filtered token lists
# returned by remove_stopwords.
data_words = remove_stopwords(data_words)
#print(data_words[:1][0][:30])

In [None]:
# Show the first few documents only; the full nested list is too long to be readable.
data_words[:5]

In [None]:
# Create Corpus
texts = data_words
# Term Document Frequency: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Number of topics to extract
num_topics = 5
# Build the LDA model; random_state fixes the seed so a re-run starts from the
# same initial state.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the keywords of each of the num_topics topics
pprint(lda_model.print_topics())

In [None]:
def extract_topic(ldamodel, topic):
    """Return the top words of one topic as a comma-separated string.

    The words come from ``ldamodel.show_topic``, which resolves term ids with
    the model's own dictionary, so the function no longer depends on a global
    ``id2word`` being in scope.

    Parameters
    ----------
    ldamodel : object exposing ``show_topic(topic)`` that returns
        ``(word, weight)`` pairs (e.g. a trained gensim LDA model).
    topic : int
        Index of the topic to summarise.

    Returns
    -------
    str
        The topic's words joined by ``', '``, in the order returned by the model.
    """
    return ', '.join(word for word, _ in ldamodel.show_topic(topic))

# Keyword summary of every topic, in topic-index order.
list_topics = []
for topic_id in range(num_topics):
    list_topics.append(extract_topic(lda_model, topic_id))

In [None]:
# Display the comma-separated keyword string built for each topic.
list_topics

In [None]:
# Vocabulary of the distinct whitespace-separated tokens found in the verbatims.
# The previous version called word_tokenize, which is never imported, on the
# list returned by doc.split(). Here the raw bytes are decoded (already-decoded
# strings pass through), missing values are skipped, and the result is sorted
# so the order is the same on every run.
verbatim_texts = [
    doc.decode("utf-8") if isinstance(doc, bytes) else str(doc)
    for doc in comms_df["verbatims"].dropna().values
]
words = sorted({word for text in verbatim_texts for word in text.split()})

In [None]:
# Separate one-column frame that will receive the per-word count columns.
df___ = comms_df[["verbatims"]].copy()
df___

In [None]:
def _to_text(value):
    """Return ``value`` as ``str``, decoding UTF-8 when it is a bytes object."""
    return value.decode("utf-8") if isinstance(value, bytes) else str(value)


# One count column per vocabulary word.
# - Verbatims are tokenized once, and whole tokens are counted rather than
#   substrings (so "a" is not counted inside "cat").
# - Bytes and str are normalised on both sides, so the counting cannot fail on
#   a bytes/str mismatch.
# - All columns are built first and attached in a single concat, instead of
#   inserting them one by one into the frame.
# - The result is rebuilt from the "verbatims" column, so re-running the cell
#   does not duplicate columns.
verbatim_tokens = df___["verbatims"].apply(lambda doc: _to_text(doc).split())
word_counts = pd.DataFrame(
    {
        word: verbatim_tokens.apply(lambda tokens, target=_to_text(word): tokens.count(target))
        for word in words
    },
    index=df___.index,
)
df___ = pd.concat([df___[["verbatims"]], word_counts], axis=1)

In [None]:
# Whitespace-split the first verbatim (by position) to inspect its tokens.
comms_df["verbatims"].iloc[0].split()

In [None]:
# List the columns of the word-count frame. The previous name `df__` is not
# defined anywhere in the notebook; `df___` is the frame built above.
list(df___.columns)

In [None]:
# Show only the first rows of the customer table.
comms_df.head()

# Data preprocessing

The project focuses on churn prediction. The dataset has two potential target variables:
+ *churn* : indicates whether the customer churned
+ *upsell_xsell* : flags the customer for cross-sell or up-sell

Since we focus on churn prediction, *upsell_xsell* will be dropped.

Furthermore, the variables listed below are useless for predictive modeling and will be rejected :
+ city
+ city_lat
+ city_long
+ data_usage_amt
+ mou_onnet_6m_normal
+ mou_roam_6m_normal
+ region_lat
+ region_long
+ state_lat
+ state_long
+ tweedie_adjusted

In addition, character variables are read as byte strings and are displayed in the form `b'prime'`. We will decode them to regular strings, which removes the leading `b` and the surrounding quotes.

In [None]:
# Columns removed before modelling: the unused second target (upsell_xsell)
# and the variables listed above as not useful for prediction.
columns_to_drop = ["upsell_xsell", "city", "city_lat", "city_long", "data_usage_amt", "mou_onnet_6m_normal", "mou_roam_6m_normal", "region_lat",
                   "region_long", "state_lat", "state_long", "tweedie_adjusted"]
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
comms_df = comms_df.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Columns that are neither int64 nor float64; these are the candidates for
# byte-string decoding.
list_vars_object = list(comms_df.select_dtypes(exclude=['int64', 'float64']).columns)

# Decode byte strings to str. Only bytes values are touched, so missing values
# and already-decoded strings pass through unchanged and the cell can be
# re-run safely.
for var in list_vars_object:
    comms_df[var] = comms_df[var].apply(
        lambda value: value.decode("utf-8") if isinstance(value, bytes) else value
    )

In [None]:
# Missing-value summary: one row per variable with at least one missing value,
# giving its count and its percentage of rows (2 decimals), largest count first.
missing_counts = comms_df.isnull().sum()
df_val_mqtes = (
    missing_counts[missing_counts > 0]
    .rename_axis("variable")
    .reset_index(name="nb_valeur_manquante")
    .assign(
        pourcent_valeur_manquante=lambda frame: round(
            100 * frame["nb_valeur_manquante"] / comms_df.shape[0], 2
        )
    )
    .sort_values("nb_valeur_manquante", ascending=False)
    .reset_index(drop=True)
)


In [None]:
# Names of the variables that have missing values.
df_val_mqtes["variable"].tolist()

In [None]:
# Variables with missing values that take fewer than 50 distinct values
# (a missing value counts as one distinct value).
l_ = [
    name
    for name in df_val_mqtes["variable"].tolist()
    if comms_df[name].nunique(dropna=False) < 50
]

l_

In [None]:
# Most frequent value of tot_drpd_pr1 (first entry of the descending value counts).
tot_drpd_counts = comms_df["tot_drpd_pr1"].value_counts()
tot_drpd_counts.index[0]

In [None]:
# Underlying NumPy array of the customer table.
comms_df.to_numpy()

In [None]:
# Distribution of the variable in row 4 of the missing-value summary.
hist_variable = df_val_mqtes.loc[4, "variable"]
sns.histplot(comms_df[hist_variable])
plt.show()

# Data splitting

In [None]:
# Summary statistics of the target variable.
comms_df["churn"].describe()

The target variable *churn* does not appear to have missing values. 12.13 % of the customers in the dataset churned. We will split the dataset into 70 % for training and 30 % for testing, using *churn* as the stratification variable.

In [None]:
# Separate the target (churn) from the predictors.
y = comms_df["churn"]
X = comms_df.drop("churn", axis=1)

In [None]:
# 70/30 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.30, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Re-attach the target to build the full train and test tables.
# `train = X_train` only aliased the feature frame, so adding "churn" to it
# also wrote the target into X_train (and likewise X_test), leaking the label
# into the predictors. DataFrame.assign returns a new frame and leaves
# X_train / X_test untouched.
train = X_train.assign(churn=y_train)

test = X_test.assign(churn=y_test)