# Capstone Project 2

## Data Exploration and Cleaning

### Import Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import os
import glob
import matplotlib as mpl
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Apply the ggplot theme, then set the notebook-wide figure size and font size.
mpl.style.use('ggplot')
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'font.size': 12,
})

In [None]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
type(train)

In [None]:
train

In [None]:
test.head()

In [None]:
print(train.comment_text[0])

In [None]:
list(train)

### Not all comments are toxic, as shown above. However, the dataset has no explicit label for comments that are none of toxic, severe_toxic, obscene, threat, insult, or identity_hate.

## Check for missing values

In [None]:
print("Check for missing values in Train dataset")
null_check=train.isnull().sum()
print(null_check)

In [None]:
print("Check for missing values in Test dataset")
null_check = test.isnull().sum()
print(null_check)

# Replace missing comments with a placeholder so later string operations
# (len, split, vectorizers) never see NaN. The result is assigned back to the
# column: calling fillna(inplace=True) on a selected column is chained
# assignment, which pandas deprecates and which does not reliably update the
# parent DataFrame under copy-on-write.
print("filling NA with \"unknown\"")
train["comment_text"] = train["comment_text"].fillna("unknown")
test["comment_text"] = test["comment_text"].fillna("unknown")

## Determine the number of each comment type

In [None]:
# Column sums give the per-label totals (assumes the label columns are 0/1
# indicators -- TODO confirm against the CSV).
toxic, severe_toxic, obscene, threat, insult, identity_hate = (
    train[label].sum()
    for label in (
        'toxic', 'severe_toxic', 'obscene',
        'threat', 'insult', 'identity_hate',
    )
)

In [None]:
print ('toxic comments', toxic)
print('severe toxic comments', severe_toxic)
print('obscene comments', obscene)
print('threatening comments', threat)
print('insulting comments', insult)
print('identity hate comments', identity_hate)

## Bar chart showing comment types

In [None]:
import matplotlib.pyplot as plt

# Label names and their totals, kept in matching order for the bar chart.
objects = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
num_comments = [toxic, severe_toxic, obscene, threat, insult, identity_hate]

print(objects)
pos = np.arange(len(objects))
print(pos)
print(num_comments)

# One bar per label, with height equal to the number of comments carrying it.
plt.bar(objects, num_comments, align='center', alpha=0.5)

plt.show()

## How many comments are friendly?

In [None]:
# Sum the six label flags per comment. The label columns are selected by name
# rather than by position (iloc[:, 2:]) so the result stays correct if this
# cell is re-run after helper columns such as 'friendly' or 'word_count' have
# been appended to train.
friendly = train[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].sum(axis=1)

In [None]:
# A comment is "friendly" when none of its label flags are set (label sum is 0).
train['friendly'] = friendly.eq(0)
print("Total comments ", len(train))
print("Total clean comments ", train['friendly'].sum())

### Check the column names for the dataframe: train

In [None]:
list(train)

## Create correlation plot to see if some comments go together

In [None]:
# Keep only the six label columns for the correlation matrix (this leaves out
# the 'friendly' clean-comment flag). Selecting by name avoids the positional
# slice iloc[:, 2:-1], which silently picks up extra columns if any are added
# to train after 'friendly'.
corr_df = train[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
]

In [None]:
corr = corr_df.corr()

plt.figure(figsize = (10,8))
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values, annot = True)
plt.show()

### Concatenate the train set and the test set and make a new dataframe, comment_length, containing the length of comments. Print out the new dataframe.

In [None]:
# Stack train and test into one frame. ignore_index=True gives the combined
# frame a fresh unique index; otherwise both halves keep their original row
# labels, and any later label-based operation (such as dropping by index)
# would hit rows from both halves at once.
df = pd.concat([train, test], axis=0, ignore_index=True)

print(list(df))

# Character count of every comment.
comment_length = df['comment_text'].str.len()

print(comment_length)

In [None]:
df_len = pd.Series.to_frame(comment_length)

### Assign 'comment_text'  as column name.

In [None]:
df_len.columns = ['comment_text']

In [None]:
# Check the type of df_len

type(df_len)

In [None]:
# verify column label

list(df_len)

## Plot the number of characters in each comment

In [None]:
import matplotlib.pyplot as plt

# Normalised histogram of comment lengths. density=True replaces the old
# `normed` keyword, which current Matplotlib no longer accepts.
letter_plot = df_len.plot(kind='hist', density=True, bins=100, legend=False)
plt.title('Number of Letters in each Comment')

plt.show()

## Determine number of words in each comment and plot distribution

In [None]:
df.head()

In [None]:
df['word_count'] = df['comment_text'].apply(lambda x: len(str(x).split(" ")))
df[['comment_text','word_count']].head()

In [None]:
df['word_count'].max()

In [None]:
# Normalised word-count histogram with a log-scaled x axis. density=True
# replaces the old `normed` keyword, which current Matplotlib no longer
# accepts. The bin count is passed directly to plot(); standalone `bins` /
# `legend` variables have no effect on the figure.
letter_plot = df['word_count'].plot(kind='hist', density=True, bins=50, logx=True)
plt.title('Number of Words in each Comment')

plt.show()

## Drop comments with fewer than 10 words from the combined DataFrame

In [None]:
# Keep only comments with at least 10 words. A boolean mask filters row by
# row; dropping by index label would also remove every other row sharing that
# label, which happens when the index contains duplicates (e.g. after
# concatenating frames without resetting the index).
df = df[df.word_count >= 10]

In [None]:
# Normalised word-count histogram (log-scaled x axis) after filtering.
# density=True replaces the old `normed` keyword, which current Matplotlib no
# longer accepts; the bin count is passed directly to plot().
letter_plot = df['word_count'].plot(kind='hist', density=True, bins=50, logx=True)
plt.title('Number of Words in each Comment')

plt.show()

In [None]:
df.word_count.count()

## Repeat the above steps for the separate train and test data sets

In [None]:
comment_length_train = train['comment_text'].apply(len)

In [None]:
train_len = pd.Series.to_frame(comment_length_train)

In [None]:
train_len.columns = ['comment_text']

In [None]:
type(train_len)

In [None]:
train['word_count'] = train['comment_text'].apply(lambda x: len(str(x).split(" ")))
train[['comment_text','word_count']].head()

In [None]:
# Normalised word-count histogram for the train set (log-scaled x axis).
# density=True replaces the old `normed` keyword, which current Matplotlib no
# longer accepts; the bin count is passed directly to plot().
letter_plot = train['word_count'].plot(kind='hist', density=True, bins=50, logx=True)
plt.title('Number of Words in each Comment - Train Data')

plt.show()

In [None]:
comment_length_test = test['comment_text'].apply(len)

test_len = pd.Series.to_frame(comment_length_test)

test_len.columns = ['comment_text']

test['word_count'] = test['comment_text'].apply(lambda x: len(str(x).split(" ")))
test[['comment_text','word_count']].head()

In [None]:
# Normalised word-count histogram for the test set (log-scaled x axis).
# density=True replaces the old `normed` keyword, which current Matplotlib no
# longer accepts; the bin count is passed directly to plot().
letter_plot = test['word_count'].plot(kind='hist', density=True, bins=50, logx=True)
plt.title('Number of Words in each Comment - Test Data')

plt.show()

In [None]:
train.head()

## Find out which comments have multiple labels

In [None]:
columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
train['how_many'] = train[columns].sum(axis = 1)

count_mult_class = train['how_many'].value_counts()

print (count_mult_class)

### From the above output:
####  5470 comments have one label
####  2678 comments have two labels
####  etc.

## Examples of comment types: 
### toxic, severe_toxic, obscene, threat, insult, identity_hate

In [None]:
print("toxic: ")
print(train[train.toxic == 1].iloc[1,1])

In [None]:
print("severe_toxic: ")
print(train[train.severe_toxic == 1].iloc[1,1])

In [None]:
print("obscene: ")
print(train[train.obscene == 1].iloc[1,1])

In [None]:
print("threat: ")
print(train[train.threat == 1].iloc[1,1])

In [None]:
print("insult: ")
print(train[train.insult == 1].iloc[1,1])

In [None]:
print("identity_hate: ")
print(train[train.identity_hate == 1].iloc[1,1])

## Drop comments with fewer than 10 words from the Train data set

In [None]:
# Remove train comments that have fewer than 10 words.
train = train.drop(train[train.word_count < 10].index)

# Normalised word-count histogram of the remaining comments (log-scaled x
# axis). density=True replaces the old `normed` keyword, which current
# Matplotlib no longer accepts; the bin count is passed directly to plot().
letter_plot = train['word_count'].plot(kind='hist', density=True, bins=50, logx=True)
plt.title('Number of Words in each Comment - Train Comments > than 10 words long')

plt.show()

In [None]:
train.word_count.count()

## Drop comments with fewer than 10 words from the Test data set

In [None]:
# Remove test comments that have fewer than 10 words.
test = test.drop(test[test.word_count < 10].index)

# Normalised word-count histogram of the remaining comments (log-scaled x
# axis). density=True replaces the old `normed` keyword, which current
# Matplotlib no longer accepts; the bin count is passed directly to plot().
letter_plot = test['word_count'].plot(kind='hist', density=True, bins=50, logx=True)
plt.title('Number of Words in each Comment - Test Comments > than 10 words long')

plt.show()

In [None]:
def f(row):
    """Map a row of label flags to a single integer class code.

    The labels are checked in a fixed priority order and the position of the
    first one whose value equals 1 is returned: toxic=0, severe_toxic=1,
    obscene=2, threat=3, insult=4, identity_hate=5. A row with none of the
    labels set yields -1.
    """
    priority = ('toxic', 'severe_toxic', 'obscene',
                'threat', 'insult', 'identity_hate')
    for code, label in enumerate(priority):
        if row[label] == 1:
            return code
    return -1

In [None]:
train['type_comment'] = train.apply(f, axis=1)

In [None]:
train.head()

## Visualize comments using Wordclouds

In [None]:
# Comments flagged as friendly (no toxicity label set).
friendly_subset = train[train.friendly]

text = friendly_subset.comment_text.values
# STOPWORDS is the stop-word set imported from the wordcloud package.
wc = WordCloud(background_color='black', max_words=2000, stopwords=STOPWORDS)
wc.generate(" ".join(text))
plt.figure(figsize=(20, 10))
plt.axis('off')
plt.title('Frequent Friendly Words')
# colormap and random_state are two separate keyword arguments of recolor();
# a fixed random_state keeps the colouring reproducible between runs.
plt.imshow(wc.recolor(colormap='viridis', random_state=20), alpha=0.98)
plt.show()

## Tokenization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.head()

In [None]:
y = train.type_comment

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train['comment_text'], y, test_size = 0.33, random_state = 53)

In [None]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
# Split the labelled training comments. The features must come from `train`,
# the same frame `y` was taken from; `df` also contains the unlabelled test
# rows, so its length does not match `y` and the split would fail.
X_train, X_test, y_train, y_test = train_test_split(
    train['comment_text'], y, test_size=0.33, random_state=53
)

In [None]:
import re
import string

# Every ASCII punctuation mark, plus a few typographic symbols, is treated as
# its own token. re.escape keeps characters such as '\', ']' and '^' literal
# inside the character class; unescaped, the '\]' pair in string.punctuation
# is parsed as an escaped bracket and the backslash itself is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split *s* into tokens, with each punctuation mark as a separate token.

    Each matched punctuation character is padded with spaces, then the string
    is split on whitespace. An empty string yields an empty list.
    """
    return re_tok.sub(r' \1 ', s).split()

## Create a numerical feature vector for each comment

## Initialize a CountVectorizer object

In [None]:
count_vectorizer = CountVectorizer(stop_words = 'english')

vect = CountVectorizer(tokenizer=tokenize)
tf_train = vect.fit_transform(X_train)
tf_test = vect.transform(X_test)

In [None]:
tf_train

In [None]:
tf_test

## Build a classifier

In [None]:
print(tf_train)

In [None]:
print(tf_test)

## Transform the test data using only the text column values

In [None]:
# Learn the vocabulary from the training comments only, then reuse it for the
# test comments. Fitting on X_test would give the test matrix its own
# vocabulary and column order, and would leak test data into the features.
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

## Print the first 10 features of the count_vectorizer

In [None]:
print(count_vectorizer.get_feature_names()[:10])

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN needs numeric feature vectors, so it is fit on the token-count matrix
# built from the comments rather than on the raw text Series.
knn = KNeighborsClassifier()
knn.fit(tf_train, y_train)
knn.predict(tf_test)

## Initialize a TfidfVectorizer object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

## Transform the training data

In [None]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

print(tfidf_train)

## Transform the test data

In [None]:
tfidf_test = tfidf_vectorizer.transform(X_test)

## Print the first 10 features

In [None]:
print(tfidf_vectorizer.get_feature_names()[:10])

## Print the first 5 vectors of the tfidf training data

In [None]:
# Show the first five TF-IDF rows as a dense array.
print(tfidf_train[:5].toarray())

In [None]:
type(count_train)

import scipy
from scipy.sparse import csr_matrix
#csr_matrix(count_train).toarray()
#count_train.A
count_train.todense()
#count_train = count_train.A

## Create the CountVectorizer DataFrame

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
count_feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, 'get_feature_names_out')
    else count_vectorizer.get_feature_names()
)

# Build the frame from the sparse matrix directly: the plain DataFrame
# constructor does not accept a SciPy sparse matrix, and sparse columns avoid
# materialising a dense comments-by-vocabulary array.
count_df = pd.DataFrame.sparse.from_spmatrix(count_train, columns=count_feature_names)

In [None]:
print (count_train)

In [None]:
print(count_vectorizer.get_feature_names())

## Inspect the dense form of count_train (count_train.A)

In [None]:
print(tfidf_train[:5])

In [None]:
print(count_train.A)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
count_feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, 'get_feature_names_out')
    else count_vectorizer.get_feature_names()
)

# Keep the counts sparse: converting the whole matrix to a dense array
# allocates one cell per (comment, vocabulary term) pair, which can exhaust
# memory on a large corpus.
count_df = pd.DataFrame.sparse.from_spmatrix(count_train, columns=count_feature_names)