# Natural Language Processing

In [1]:
import pandas as pd
import numpy as np
# Word cloud plots for Names
from wordcloud import WordCloud,ImageColorGenerator
from scipy.misc import imread
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
# LDA & LSI packages
import nltk
nltk.download('punkt')
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import similarities
# Random forest packages
from sklearn.feature_extraction.text import TfidfVectorizer as tfid
from sklearn.naive_bayes import MultinomialNB as multi
from sklearn.ensemble import RandomForestClassifier as randomf
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report as report

[nltk_data] Downloading package punkt to /Users/elainny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  from numpy.core.umath_tests import inner1d


In [2]:
# import datasets
df = pd.read_csv('Data/cleaned-data.csv')
train = pd.read_csv('Data/cleaned-train.csv')
test = pd.read_csv('Data/cleaned-test.csv')

In [3]:
# Select useful columns for NLP
df_nlp = df[['AdoptionSpeed','Description','Name','DataType','Type']]
df_nlp.head(10)

Unnamed: 0,AdoptionSpeed,Description,Name,DataType,Type
0,2.0,Nibble is a 3+ month old ball of cuteness. He ...,Nibble,train,Cat
1,0.0,I just found it alone yesterday near my apartm...,No Name Yet,train,Cat
2,3.0,Their pregnant mother was dumped by her irresp...,Brisco,train,Dog
3,2.0,"Good guard dog, very alert, active, obedience ...",Miko,train,Dog
4,2.0,This handsome yet cute boy is up for adoption....,Hunter,train,Dog
5,2.0,This is a stray kitten that came to my house. ...,No name,train,Cat
6,1.0,anyone within the area of ipoh or taiping who ...,BULAT,train,Cat
7,3.0,Siu Pak just give birth on 13/6/10 to 6puppies...,Siu Pak & Her 6 Puppies,train,Dog
8,1.0,"healthy and active, feisty kitten found in nei...",No name,train,Cat
9,4.0,"Very manja and gentle stray cat found, we woul...",Kitty,train,Cat


## Word Clouds

In [None]:
# word cloud: cat name
plt.figure(figsize=(20, 8))
bg_pic = imread('dog-paw.png')
image_colors = ImageColorGenerator(bg_pic)


cat_name = ' '.join(df.loc[df['Type'] == 'Cat', 'Name'].fillna('').values)
wc_cat = WordCloud(mask=bg_pic,background_color='white',scale=20,max_words=300).generate(cat_name)

plt.imshow(wc_cat.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.savefig('cat.png')
#plt.show()

In [None]:
# word cloud: dog name
plt.figure(figsize=(20, 8))
dog_name = ' '.join(df.loc[df['Type'] == 'Dog', 'Name'].fillna('').values)
wc_dog = WordCloud(mask=bg_pic,background_color='white',scale=20,max_words=300).generate(dog_name)
plt.imshow(wc_dog.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.savefig('dog.png')
plt.show()

## LDA & LSI model

In [None]:
# create train_data by selecting Description columns in train data
train_data = train['Description'].values.tolist()

In [None]:
# define stopwords, lemmatizer, and data cleaning process
stop_words = set(stopwords.words("english"))
#Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def cleandata(review) :
    clean_des = re.sub('[^a-zA-Z]', ' ', str(review)) # Remove punctuation/words not starting with alphabet
    clean_des = clean_des.lower() # make words lower cases
    words = word_tokenize(clean_des) # tokenize
    words = [w for w in words if not w in stop_words] # stop words removal
    words = [wordnet_lemmatizer.lemmatize(w) for w in words] #Lemmatize words
    return words

In [None]:
# Define the number of topics
t = 10

cleaned = []
for description in train_data:
    cleaned.append(cleandata(description))
# Create a Dictionary associate word to id
D = corpora.Dictionary(cleaned)

# Transform texts to numeric
corpra = [D.doc2bow(i) for i in cleaned]

In [None]:
# Build the LDA model
lda = models.LdaModel(corpus=corpra, num_topics=t, id2word=D)

print('LDA model')
for index in range(0,t):
    # top 9 topics
    print("Topic Number %s:" % str(index+1), lda.print_topic(index, 9))
print("-" * 117)

In [None]:
# Build the LSI model
lsi = models.LsiModel(corpus=corpra, num_topics=t, id2word=D)

print('LSI model')
for index in range(0,t):
    # top 9 topics
    print("Topic Number %s:" % str(index+1), lsi.print_topic(index, 9))
print("-" * 117)

In [None]:
# randomly pick one description from test to predict similarity.
import random
i = random.randint(1,3948) # since my test dataset has 3498 values 
print(i)
test_data = test.loc[i,'Description']
print('-----This is the description from test that I am going to predict:-----')
print(test_data)

In [None]:
# compare LDA model and LSI model to predict similarity.
lda_i = similarities.MatrixSimilarity(lda[corpra])
m = D.doc2bow(cleandata(test_data))
# perform some queries
similar_lda = lda_i[lda[m]]
# Sort the similarities
LDA = sorted(enumerate(similar_lda), key=lambda item: -item[1])
# Top 10 most similar documents:
print(LDA[:10])
# the most similar document
doc_id, similarity = LDA[1]
print(train_data[doc_id][:100])

In [None]:
# Do the same similarity queries by using LSI model
lsi_i = similarities.MatrixSimilarity(lsi[corpra])
similar_lsi = lsi_i[lsi[m]]
LSI = sorted(enumerate(similar_lsi), key=lambda item:-item[1])
print(LSI[:10])
doc_id_lsi, similarity_lsi = LSI[1]
print(train_data[doc_id][:100])

## Random Forest

In [4]:
# Choose the correct columns and remove null values
train = train[['Description','AdoptionSpeed']]
train_null = np.array(train[train['Description'].isnull() == True].index)
train = train.drop(train_null)
train.shape

(14981, 2)

In [5]:
# Analyze Descrtion based on AdoptionSpeed, so need to seperate train into 5 groups
train0 = np.array(train[train['AdoptionSpeed'] == 0].index)
train1 = np.array(train[train['AdoptionSpeed'] == 1].index)
train2 = np.array(train[train['AdoptionSpeed'] == 2].index)
train3 = np.array(train[train['AdoptionSpeed'] == 3].index)
train4 = np.array(train[train['AdoptionSpeed'] == 4].index)

adoption1 = [train1[i] for i in range(len(train0))]
adoption2 = [train2[i] for i in range(len(train0))]
adoption3 = [train3[i] for i in range(len(train0))]
adoption4 = [train4[i] for i in range(len(train0))]

In [6]:
# Combine the X and Y dataset according to different AdoptionSpeeds
X = pd.concat([train['Description'].reindex(train0), 
               train['Description'].reindex(adoption1),
               train['Description'].reindex(adoption2), 
               train['Description'].reindex(adoption3),
               train['Description'].reindex(adoption4)])

Y = pd.concat([train['AdoptionSpeed'].reindex(train0), 
               train['AdoptionSpeed'].reindex(adoption1),
               train['AdoptionSpeed'].reindex(adoption2), 
               train['AdoptionSpeed'].reindex(adoption3),
               train['AdoptionSpeed'].reindex(adoption4)])

In [7]:
# Split the X and Y data into train and valid:
from sklearn.model_selection import train_test_split
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=.2, random_state=42)
X_train.shape, Y_train.shape, X_valid.shape, Y_valid.shape

((1640,), (1640,), (410,), (410,))

In [8]:
# Define data cleaning process(tokenize, lower cases, lemmatizer) for random forest
# and apply data cleaning on X_train and X_valid

wordnet_lemmatizer = WordNetLemmatizer() # same as LDA and LSI but do not remove stop words

def cleandata_rf(reviews):
    token = [word_tokenize(i) for i in reviews]
    token1 = [[d.lower() for d in words if d.isalpha() == True] for words in token]
    lemma = [[wordnet_lemmatizer.lemmatize(word) for word in doc] for doc in token1]
    review = [" ".join(i) for i in lemma]
    return review

In [9]:
# Do data cleaning
X_train_clean = cleandata_rf(X_train)
X_valid_clean = cleandata_rf(X_valid)

In [10]:
#Machine Learning model(randomforest) on Description
tfvec = tfid(stop_words='english', ngram_range=(1, 1), lowercase=False)
mb = multi()
randomforest = randomf()
pipe = Pipeline([('vectorizer', tfvec),('rf', mb)])

In [11]:
# Y_pred and report
pipe.fit(X_train_clean, Y_train)
Y_pred = pipe.predict(X_valid_clean)
print(report(Y_valid, Y_pred))

             precision    recall  f1-score   support

          0       0.44      0.31      0.36        87
          1       0.32      0.24      0.27        79
          2       0.13      0.27      0.17        66
          3       0.27      0.24      0.26        94
          4       0.30      0.21      0.25        84

avg / total       0.30      0.26      0.27       410



  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
