## Warranty Data Analysis Autocar 
## Updated Approach using Regular Expressions

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os
print(os.listdir("../input"))

# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS  #very important to change language
from spacy.lang.en import English
# NOTE(review): this downloads the Spanish model (es_core_news_sm), but the notebook
# later calls spacy.load('en_core_web_lg') — confirm which model is actually intended.
!python -m spacy download es_core_news_sm

# Pandas display width: show up to 100 characters of each cell when a DataFrame is displayed
pd.set_option('max_colwidth',100) #max width df display

In [None]:
# Load the Autocar warranty records from CSV into the `wines` DataFrame.
# The free-text field analysed by the rest of the notebook is the `text` column.
wines = pd.read_csv('../input/dbwmlda/dbwmalda2.csv')
wines.head(10)
# Inspect one raw record; 15506 is a hard-coded row label, so this assumes the file is at least that long.
wines.text[15506]


## wines.shape

In [None]:
# *** Regular Expression Cleaning
import re
import string
# Ordered clean-up rules: (compiled pattern, replacement). The order is significant:
# 'complaint--' must run before 'complaint', and the three rules after the
# non-letter collapse operate on the already-collapsed text.
_ROUND1_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        ("_x000D_", ""),              # carriage-return artefact present in the raw text
        ("Autocar", ""),
        ("AUTOCAR", ""),
        ("retorque", ""),
        ("RETORQUE", ""),
        ("Retorque", ""),
        ("\n", ""),
        (r"\(.*?\)", ""),             # drop anything inside parentheses (non-greedy)
        ("WHEELSRIMHUB", ""),
        ("CABSHEET", ""),
        ("ENGINEPOWER", ""),
        ("LIGHTING", ""),
        ("EXPENDABLEITMES", ""),
        ("BRAKE", ""),
        ("COOLSYSTEM", ""),
        ("EXHAUST", ""),
        ("C1:", ""),
        ("C2:", ""),
        ("C3:", ""),
        ("Complaint:", ""),
        ("CHARGING:", ""),
        ("warranty", ""),
        ("Warranty", ""),
        ("WARRANTY", ""),
        ("cancel", ""),
        ("repair", ""),
        ("complaint--", ""),
        ("complaint", ""),
        (r"[^A-Za-z']+", " "),        # collapse every run of non-letters into one space
        ("Deferred", ""),
        ("Order", ""),
        ("order", ""),
    )
)


def clean_text_round1(text):
    """Strip boilerplate tokens and non-letter characters from one warranty record.

    The rules in ``_ROUND1_RULES`` are applied one after another, each on the
    output of the previous one. Matching is case-sensitive, exactly as listed.

    Args:
        text: Raw record text. ``None`` and float NaN (what pandas yields for
            empty CSV cells) are treated as an empty record.

    Returns:
        The cleaned string; ``""`` for a missing value.
    """
    # Missing cells previously crashed re.sub with a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    for pattern, replacement in _ROUND1_RULES:
        text = pattern.sub(replacement, text)
    return text


def round1(x):
    """Thin alias of ``clean_text_round1`` kept for use with ``Series.apply``."""
    return clean_text_round1(x)

In [None]:
# Run the round-1 regex clean-up over every record of the `text` column of `wines`
# and keep the result as a one-column DataFrame.
data_clean = wines["text"].apply(round1).to_frame()
data_clean.head(10)


In [None]:
# Overwrite the raw `text` column with the cleaned text (data_clean is a one-column DataFrame).
wines.text=data_clean

## Registra el número de temas a identificar en la siguiente línea

In [None]:
# Number of topics to look for, and the name of the DataFrame column that holds the text to analyse
total_topics=4
field_source='text'

In [None]:
# Show (rows, columns) of the DataFrame after the cleaning step.
# (A previous `wines=newdf` reassignment is left disabled.)
#wines=newdf
wines.shape

In [None]:
# Look at record 3 of the configured source column
wines[field_source][3]

In [None]:
# Load the spaCy 'en_core_web_lg' pipeline; `nlp` is used below for entity rendering,
# lemmas and part-of-speech tags.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Sanity check that the right record is being processed: run the pipeline on record 3
# of the source column and render its named entities inline in the notebook.
doc = nlp(wines[field_source][3])
spacy.displacy.render(doc, style='ent',jupyter=True)

## Encontrando la raíz de los términos (Lemmatization)
## Creación de lista de puntuación y palabras comunes que no añaden mucha información


In [None]:
# Punctuation characters and stop words used by spacy_tokenizer to filter tokens
punctuations = string.punctuation
stopwords = list(STOP_WORDS)
# Optional extra domain-specific stop words (disabled):
#stopwords.append('wheelsrimhub')
#stopwords.append('x000d')

In [None]:
#Quitar signo de comentarios para revisar listas de stopwords y punctuations
#stopwords
#punctuations

In [None]:
# Rebuild the sample record as a space-separated string of token lemmas.
review = " ".join(token.lemma_ for token in doc)

In [None]:
# Re-run the pipeline on the lemmatized text and render its named entities inline.
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

## Detectando los componentes dentro de las frases ###



In [None]:
# POS tagging: print every token of the lemmatized sample next to its coarse part-of-speech tag
for i in nlp(review):
    print(i,"=>",i.pos_)

In [None]:
# Parser used by the tokenizer functions below. spacy_tokenizer additionally drops
# tokens of 4 characters or fewer, to filter out short words with no meaning in this exercise.
parser = English()
def spacy_tokenizer(sentence):
    """Normalise one sentence into a space-separated string of kept terms.

    Each token from the module-level ``parser`` is replaced by its lower-cased,
    stripped lemma (or by its lower-cased text when the lemma is the
    ``"-PRON-"`` placeholder). A term is kept only if it is longer than four
    characters and appears neither in ``stopwords`` nor in ``punctuations``.
    """
    kept_terms = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            term = token.lower_
        else:
            term = token.lemma_.lower().strip()

        # Short words carry little meaning for this exercise.
        if len(term) <= 4:
            continue
        if term in stopwords or term in punctuations:
            continue
        kept_terms.append(term)
    return " ".join(kept_terms)

In [None]:
# Try the tokenizer on record 4 of the column selected by field_source
spacy_tokenizer(wines[field_source][4])

In [None]:
# Tokenize every record of the source column (wines[field_source]) with spacy_tokenizer
# and store the result in a new `processed_source` column.
# tqdm.pandas() enables `progress_apply`, i.e. `apply` with a progress bar.
tqdm.pandas()
wines["processed_source"] = wines[field_source].progress_apply(spacy_tokenizer)

In [None]:
wines["processed_source"][0:10]

## Encontrando los temas principales entre todos los documentos

## Creación de un vector de términos


## Aquí es donde se hace el fit a la vectorizacion de la data!

In [None]:
# Build the document-term count matrix from the pre-tokenized text. No stop-word option
# is passed because spacy_tokenizer already filtered them out.
# The token pattern (letters/hyphens, at least 3 characters) is now a raw string: the
# original non-raw '\-' is an invalid escape sequence that newer Python versions warn about.
vectorizer = CountVectorizer(lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(wines["processed_source"])  # fit on the processed source column
#type(data_vectorized)
data_vectorized.shape

In [None]:
NUM_TOPICS = total_topics  # topic count comes from the parameter cell near the top of the notebook

In [None]:
# Latent Dirichlet Allocation model: fit on the count matrix with a fixed random_state.
# `data_lda` holds the per-document transform returned by fit_transform.
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True,random_state=42)
data_lda = lda.fit_transform(data_vectorized)
# Displayed as the cell output: shape of the fitted components matrix
lda.components_.shape
#type(data_lda)
#print(data_lda.shape)
#type(data_lda)

In [None]:
# Non-Negative Matrix Factorization Model
#nmf = NMF(n_components=NUM_TOPICS)
#data_nmf = nmf.fit_transform(data_vectorized) 

In [None]:
# Latent Semantic Indexing Model using Truncated SVD
#lsi = TruncatedSVD(n_components=NUM_TOPICS)
#data_lsi = lsi.fit_transform(data_vectorized)

In [None]:
# function printing keywords for each topic       *** You may select top terms ***
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Args:
        model: Fitted model exposing ``components_``; each row is treated as
            one topic and must support ``argsort()``.
        vectorizer: Fitted vectorizer whose feature names line up with the
            columns of ``model.components_``.
        top_n: Number of terms to print per topic.
    """
    # Fetch the vocabulary once (the original rebuilt it for every printed term).
    # get_feature_names() was removed in scikit-learn 1.2, so prefer the newer
    # accessor and fall back to the old one on older installations.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = names_getter()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the last top_n
        # indices, i.e. the heaviest terms first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # str() keeps the printed list identical whether names are str or numpy strings.
        print([str(feature_names[i]) for i in top_indices])

In [None]:
# Keywords for topics clustered by Latent Dirichlet Allocation (default: top 10 terms per topic)
print("LDA Model:")
selected_topics(lda, vectorizer)

## Habilitar para análisis NMF

In [None]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
#print("NMF Model:")
#selected_topics(nmf, vectorizer)

## Habilitar para análisis LSI

In [None]:
# Keywords for topics clustered by Latent Semantic Indexing
#print("LSI Model:")
#selected_topics(lsi, vectorizer)

## Identificando documentos correspondientes a cada tema

In [None]:
# function printing documents belonging to each topic  *** You may select # of top documents ***
def selected_documents(model, wines2, top_n=10, doc_topic_matrix=None):
    """Print and collect the ``top_n`` highest-weighted documents of each topic.

    Args:
        model: Fitted topic model; only ``components_.shape[0]`` (the number
            of topics) is read.
        wines2: pandas Series of document texts, in the same row order as the
            rows of the document-topic matrix.
        top_n: Number of documents to keep per topic.
        doc_topic_matrix: 2-D array indexed as ``[document, topic]``. When
            omitted, the notebook-level ``doc_topic`` variable is used, which
            is what the original implementation always read.

    Returns:
        DataFrame with the selected documents (their original index labels
        preserved) plus a ``"Tema"`` column holding the topic number; topics
        are stacked in ascending order.
    """
    if doc_topic_matrix is None:
        doc_topic_matrix = doc_topic  # notebook-level global, must be defined before the call

    num_topics = model.components_.shape[0]
    print("Printing Document belonging to each topic")
    frames = []
    for idx in range(num_topics):
        print(" ")
        print("Topic %d:" % (idx))
        print(" ")
        # argsort is ascending; the reversed slice yields the top_n heaviest rows first.
        top_positions = np.argsort(doc_topic_matrix[:, idx])[:-top_n - 1:-1]
        # argsort returns positions, so select positionally rather than by label.
        top_docs = wines2.iloc[top_positions]
        print(top_docs)
        frame = top_docs.to_frame()
        frame.insert(1, "Tema", idx)
        frames.append(frame)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(frames)
       

In [None]:
# Document-topic matrix from the fitted LDA model: one row per document, one column per topic.
# NOTE: selected_documents reads `doc_topic` as a global, so it must be assigned before the call below.
doc_topic=lda.transform(data_vectorized) 
# Top documents by topic.
# Pass wines["processed_source"] instead of wines[field_source] below to see the parsed sentences.
dfcsv = selected_documents(lda,wines[field_source].str.lower()) # source text is lower-cased before selection
doc_probs=pd.DataFrame(doc_topic) # same matrix as a DataFrame (default integer index and column labels)


In [None]:
dfcsv



In [None]:
# Topic weights of one document: row 4059 and the first 4 columns are hard-coded
# (4 matches total_topics as set above — presumably both should change together).
doc_probs.iloc[4059,:4]

## Sección de prueba para una frase

In [None]:
# Inner join, on the index, of the top-n documents per topic (text + "Tema") with their topic probabilities
tableau=pd.concat([dfcsv, doc_probs], axis=1, join='inner')

In [None]:
tableau

In [None]:
#Writing comments with probabilities by topic
#tableau.to_csv('topicdb_eng.csv')

In [None]:
# Example: score a single sentence against the fitted model.
# The sentence goes through the same tokenizer and vectorizer as the training data,
# then lda.transform gives one weight per topic.
text = spacy_tokenizer("Check engine fial and oil leak")
x = lda.transform(vectorizer.transform([text]))[0]
print('Probabilities by topic (count start with zero): ',x)
print()
# np.argmax picks the zero-based index of the topic with the highest weight
print("El tema con mayor probabilidad usando modelo LDA es el tema #",np.argmax(x))

## Visualizando los resultados de LDA con la utilería pyLDAvis

In [None]:
# Enable inline pyLDAvis rendering and prepare the visualization of the LDA model
# (t-SNE is requested for the topic layout via mds='tsne').
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

In [None]:
# Extract the topic coordinates from the prepared pyLDAvis object and save them to CSV
coordinates = pd.DataFrame(dash.topic_coordinates)
coordinates.head(10)
coordinates.to_csv("coordinates_eng.csv")

In [None]:
# Extract the topic info table from the prepared pyLDAvis object and save it to CSV
topic_info=pd.DataFrame(dash.topic_info)
topic_info.head(10)
topic_info.to_csv('topic_info_eng.csv')

In [None]:
# Extract the token table from the prepared pyLDAvis object and save it to CSV
tok_table=pd.DataFrame(dash.token_table)
tok_table.head(10)
tok_table.to_csv('tok_table_eng.csv')

In [None]:
#pyLDAvis.enable_notebook()
#dash = pyLDAvis.sklearn.prepare(nmf, data_vectorized, vectorizer, mds='tsne')
#dash

## ¿Cómo interpretar esta gráfica?
1. Los temas están a la izquierda y sus palabras respectivas a la derecha.
2. Los temas más grandes son los más frecuentes, y entre más cercanos estén, más parecidos son.
3. La selección de las palabras está basada en su capacidad de diferenciación y frecuencia.

**Selecciona el tema para ver sus palabras correspondientes.**

## Bigram spaCy tokenizer para la identificación de temas

In [None]:
def spacy_bigram_tokenizer(phrase):
    """Build a space-joined string of "<non-noun> <noun>" pairs from ``phrase``.

    Tokens come from the module-level ``parser``. Every token whose ``pos_`` is
    not ``"NOUN"`` is remembered; a ``"NOUN"`` token becomes the current noun
    (initially the empty string). After each token, every remembered non-noun
    is paired with the current noun and appended, so pairs accumulate and
    repeat as the phrase is scanned.

    NOTE(review): the repeated pairs and the pairs emitted before any noun has
    been seen (e.g. "bad ") look unintended — confirm the desired output
    before relying on this function.
    """
    non_nouns = []
    pairs = []
    current_noun = ""

    for token in parser(phrase):
        if token.pos_ == "NOUN":
            current_noun = token.text
        else:
            non_nouns.append(token.text)

        # Pair everything seen so far with the most recent noun.
        pairs.extend([word + " " + current_noun for word in non_nouns])

    return " ".join(pairs)

In [None]:
# Count matrix over unigrams and bigrams (ngram_range=(1,2)) of the processed text.
# min_df=5 and max_df=0.9 set the document-frequency limits for the vocabulary.
# Note: the bigrams here come from CountVectorizer; spacy_bigram_tokenizer is not called in this cell.
bivectorizer = CountVectorizer(min_df=5, max_df=0.9, lowercase=True, ngram_range=(1,2))
bigram_vectorized = bivectorizer.fit_transform(wines["processed_source"])

## LDA para información procesada con Bigram

In [None]:
# Fit LDA on the unigram+bigram counts. random_state is pinned to the same value as the
# unigram LDA model so the topics are reproducible from run to run.
bi_lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True,random_state=42)
data_bi_lda = bi_lda.fit_transform(bigram_vectorized)

### Temas para el modelo de bigram

In [None]:
# Keywords for the topics found by the unigram+bigram LDA model
print("Bi-LDA Model:")
selected_topics(bi_lda, bivectorizer)

In [None]:
# Prepare and display the pyLDAvis visualization for the unigram+bigram LDA model
bi_dash = pyLDAvis.sklearn.prepare(bi_lda, bigram_vectorized, bivectorizer, mds='tsne')
bi_dash

In [None]:
import pandas as pd
# Small hand-written example frame (mixed string and integer cells), unrelated to the warranty data
master_list = [['cat', 123, 'yellow'], ['dog', 12345, 'green'], ['horse', 123456, 'red']]
df = pd.DataFrame(master_list)

df

In [None]:
# Left-strip leading whitespace from string cells only, then left-align the rendered table.
# The previous `df.stack().str.lstrip().unstack()` replaced every non-string cell
# (e.g. the integer column) with NaN, because the .str accessor yields NaN for non-strings.
# 'display.colheader_justify' is documented as 'left'/'right'; the original 'light' looks
# like a typo, and 'left' matches the text-align styling applied below.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.colheader_justify','left', 'display.width', 2000, 'display.max_colwidth', 500):
    df = df.apply(lambda column: column.map(lambda cell: cell.lstrip() if isinstance(cell, str) else cell))
    df = df.style.set_properties(**{'text-align': 'left'})
df