In [1]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer # Stemming
from nltk.tokenize import RegexpTokenizer # Tokenizing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from spacy_wordnet.wordnet_annotator import WordnetAnnotator

SyntaxError: invalid syntax (<ipython-input-1-f3035fa09f71>, line 2)

In [None]:
# Read the complaints export into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='complaints.csv')
df.head(n=5)

In [None]:
# Encode each distinct Product value as an integer code, in order of first appearance.
product_codes, _ = df['Product'].factorize()
df['product_id'] = product_codes

# One row per (Product, product_id) pair, ordered by the integer code.
product_id_df = (
    df[['Product', 'product_id']]
    .drop_duplicates()
    .sort_values('product_id')
)

# Lookup tables in both directions: name -> code and code -> name.
product_to_id = {name: code for name, code in product_id_df.values}
id_to_product = {code: name for name, code in product_id_df.values}

In [None]:
# Shared NLP helpers: a word-character tokenizer and an English Snowball stemmer.
tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer("english")

def clean_text(text):
  """Normalise one complaint narrative into a space-separated string of stems.

  Steps: lower-case, drop the literal '{html}' marker, replace punctuation
  with spaces, remove digit runs, tokenize on word characters, stem every
  token and keep only stems longer than two characters.

  Args:
    text: The raw narrative. Non-string values (for example the float NaN
      that pandas uses for a missing cell) are treated as empty text.

  Returns:
    The cleaned text as a single string; "" when there is nothing to keep.
  """
  # Missing narratives arrive as NaN (a float) and have no .lower(); map them
  # to an empty document instead of raising AttributeError.
  if not isinstance(text, str):
    return ""

  text = text.lower()
  text = text.replace('{html}', "")
  text = re.sub(r'[^\w\s]', ' ', text)
  text = re.sub('[0-9]+', '', text)

  # RegexpTokenizer instances are not callable; tokenize() does the split.
  tokens = tokenizer.tokenize(text)
  tokens = [stemmer.stem(t) for t in tokens]
  tokens = [t for t in tokens if len(t) > 2]

  return " ".join(tokens)

In [None]:
# Store the cleaned version of every narrative in a new 'input' column.
narratives = df['Consumer complaint narrative']
df['input'] = narratives.map(clean_text)

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set
# for de-duplicated, constant-time membership checks.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [None]:
# Recent scikit-learn releases validate `stop_words` and accept only 'english',
# a list, or None, so the NLTK set has to be converted to a list. The corpus
# was stemmed by clean_text, so the stemmed form of every stop word is added
# too; otherwise stems such as "becaus" or "veri" would slip past the filter.
# sorted() gives a list with a deterministic order.
vectorizer_stop_words = sorted(stop_words | {stemmer.stem(word) for word in stop_words})

countVec = CountVectorizer(stop_words=vectorizer_stop_words)

# NOTE(review): .toarray() materialises a dense (documents x vocabulary) matrix.
# It is kept so `features` remains a NumPy array for the other cells, but it
# can exhaust memory on a large corpus - consider keeping the sparse matrix.
features = countVec.fit_transform(df.input).toarray()
labels = df.product_id
features.shape

In [None]:
# Number of top-scoring terms printed per n-gram size.
N = 5

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. The vocabulary does not change between products, so it
# is looked up once instead of on every loop iteration.
all_feature_names = np.asarray(countVec.get_feature_names_out())

for product, product_id in sorted(product_to_id.items()):

  # One-vs-rest chi-squared score of every term against the current product.
  chi2_scores, _ = chi2(features, labels == product_id)

  # Ascending sort: the strongest terms end up at the tail of the array.
  feature_names = all_feature_names[np.argsort(chi2_scores)]

  # Bucket terms by word count. The bigram and trigram buckets stay empty
  # unless the vectorizer was built with an ngram_range that produces them.
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  trigrams = [v for v in feature_names if len(v.split(' ')) == 3]

  print("# '{}':".format(product))
  print(" . Most correalted unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
  print(" . Most correalted bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
  print(" . Most correalted trigrams:\n. {}".format('\n. '.join(trigrams[-N:])))

## Understanding cross-validation
### Reference: https://www.geeksforgeeks.org/cross-validation-machine-learning/

In [None]:
# Candidate classifiers compared with k-fold cross-validation.
# The list elements need separating commas; without them the cell is a
# SyntaxError.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=42),
]

# Number of cross-validation folds.
CV = 5
entries = []

for model in models:

  # Dunder attributes use double underscores: __class__.__name__.
  model_name = model.__class__.__name__

  # cross_val_score needs the target vector to score a supervised model.
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)

  # One (model, fold, accuracy) record per fold.
  entries.extend(
      (model_name, fold_idx, accuracy)
      for fold_idx, accuracy in enumerate(accuracies)
  )

# Build the results frame once from the collected records.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the per-fold accuracies with the individual folds overlaid.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
# The keyword is spelled `edgecolor`; `edgecolour` is not recognised.
sns.stripplot(x='model_name', y='accuracy', data=cv_df, size=8, jitter=True, edgecolor='grey', linewidth=2)

plt.show()

In [None]:
# Mean cross-validated accuracy for each model.
cv_df.groupby('model_name')['accuracy'].mean()