### Consumer complaint classification

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score

In [None]:
# Load the raw complaints export; the path is relative to the notebook's working directory.
df = pd.read_csv("Consumer_Complaints.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the raw frame, before any column selection or filtering.
df.shape

In [None]:
# Keep only the label column and the free-text column used for modelling.
col = ['Product', 'Consumer Complaint']
df = df.loc[:, col]

In [None]:
# Drop rows whose complaint text is missing; there is nothing to vectorise for them.
df = df[df['Consumer Complaint'].notnull()]

In [None]:
# Rename the two kept columns so the text column can be reached by attribute access
# (df.Consumer_complaint is used further down).
df.columns=['Product', 'Consumer_complaint']

In [None]:
# Integer-encode each product label. assign() builds a new frame instead of writing a
# column into a filtered slice, which avoids pandas' SettingWithCopyWarning.
df = df.assign(category_id=df['Product'].factorize()[0])

In [None]:
# One row per distinct (Product, category_id) pair, ordered by the integer id.
cat_id_df = (
    df[["Product", "category_id"]]
    .drop_duplicates()
    .sort_values('category_id')
)

In [None]:
# Lookup: product name -> integer id.
cat_to_id = dict(zip(cat_id_df['Product'], cat_id_df['category_id']))

In [None]:
# Reverse lookup: integer id -> product name.
id_to_cat = dict(zip(cat_id_df['category_id'], cat_id_df['Product']))

In [None]:
# Preview the cleaned frame, now including the category_id column.
df.head()

In [None]:
# Create an 8x6 inch figure.
# NOTE(review): the bar chart is drawn in the next cell; with the inline backend each cell
# renders its own figure, so this size may not carry over to that plot — confirm.
fig = plt.figure(figsize= (8,6))

In [None]:
# Number of complaints per product. figsize is passed here so the 8x6 size applies to the
# figure that is actually drawn, independent of figures created in other cells.
df.groupby('Product').Consumer_complaint.count().plot.bar(ylim=0, figsize=(8, 6))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectoriser configuration:
#   sublinear_tf - use a logarithmic form of the term frequency
#   min_df       - a term must occur in at least 5 documents to be kept
#   norm         - scale every feature vector to unit Euclidean length
#   ngram_range  - extract both unigrams and bigrams
#   stop_words   - drop common English stop words to cut down noisy features
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)

In [None]:
# Fit the vectoriser on every complaint and keep the result as a sparse matrix.
# Densifying a unigram+bigram TF-IDF matrix with .toarray() can exhaust memory, and the
# sparse result still exposes .shape, which is all the following cell reads.
features = tfidf.fit_transform(df.Consumer_complaint)

In [None]:
# Encoded labels, followed by the (documents, vocabulary terms) size of the TF-IDF matrix.
labels = df['category_id']
features.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test split of the raw text and the product labels; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Consumer_complaint'],
    df['Product'],
    random_state=0,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words counter with default settings; it is fitted on the training text further down.
count_vect = CountVectorizer()

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Two-stage featurisation of the training text: raw token counts first, then TF-IDF
# re-weighting of those counts. Both objects are fitted on the training split only.
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

### Model Building

In [None]:
from sklearn.svm import LinearSVC
# Linear SVM trained on the TF-IDF weighted training matrix.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

In [None]:
# Classify one hand-written complaint. The text goes through the same two stages used for
# training (counts, then TF-IDF): the classifier was fitted on TF-IDF weights, not raw counts.
print(clf.predict(tfidf_transformer.transform(count_vect.transform(['I have outdated information on my credit repor']))))

In [None]:
# Predict the held-out split using the same featurisation the classifier was trained on
# (counts re-weighted by the fitted TF-IDF transformer), rather than raw counts.
y_pred = clf.predict(tfidf_transformer.transform(count_vect.transform(X_test)))

In [None]:
from sklearn import metrics

In [None]:
# Per-class precision / recall / F1 of the LinearSVC predictions on the test split.
print(metrics.classification_report(y_test,y_pred))

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Raw text -> TF-IDF (unigrams + bigrams) -> linear SVM, bundled as a single estimator so
# the vectoriser is always fitted together with the classifier.
text_clf = Pipeline(steps=[
    ('tf', TfidfVectorizer(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        ngram_range=(1, 2),
        stop_words='english',
    )),
    ('clf', LinearSVC()),
])

In [None]:
# Fit the TF-IDF + LinearSVC pipeline on the raw training text.
text_clf.fit(X_train, y_train)

In [None]:
# Spot-check: predicted product for one hand-written complaint.
text_clf.predict(['I have outdated information on my credit repor'])[0]

In [None]:
# Mean 5-fold cross-validated accuracy of the current pipeline on the training split,
# with the folds evaluated in parallel on all cores.
cv_results = cross_val_score(
    text_clf, X_train, y_train,
    cv=5, scoring="accuracy", n_jobs=-1,
)
print(cv_results.mean())

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Stand-alone Bernoulli naive Bayes instance; it is reused as a base learner in the
# stacking section at the end of the notebook.
mb=BernoulliNB()

In [None]:
# Same TF-IDF front end as before, now feeding a Bernoulli naive Bayes classifier.
text_clf = Pipeline(steps=[
    ('tf', TfidfVectorizer(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        ngram_range=(1, 2),
        stop_words='english',
    )),
    ('clf', BernoulliNB()),
])

In [None]:
# Fit the TF-IDF + BernoulliNB pipeline on the raw training text.
text_clf.fit(X_train, y_train)

In [None]:
# Spot-check: predicted product for one hand-written complaint.
text_clf.predict(['I have outdated information on my credit repor'])[0]

In [None]:
# Predictions of the naive Bayes pipeline for the held-out test split.
y_pred=text_clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the naive Bayes predictions on the test split.
print(metrics.classification_report(y_test,y_pred))

In [None]:
# Mean 5-fold cross-validated accuracy of the naive Bayes pipeline on the training split.
cv_results = cross_val_score(
    text_clf, X_train, y_train,
    cv=5, scoring="accuracy", n_jobs=-1,
)
print(cv_results.mean())

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Same TF-IDF front end, now feeding a decision tree with default settings.
text_clf = Pipeline(steps=[
    ('tf', TfidfVectorizer(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        ngram_range=(1, 2),
        stop_words='english',
    )),
    ('clf', DecisionTreeClassifier()),
])

In [None]:
# Fit the TF-IDF + decision tree pipeline on the raw training text.
text_clf.fit(X_train, y_train)

In [None]:
# Spot-check: predicted product for one hand-written complaint.
text_clf.predict(['I have outdated information on my credit repor'])[0]

In [None]:
# Predictions of the decision tree pipeline for the held-out test split.
y_pred=text_clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the decision tree predictions on the test split.
print(metrics.classification_report(y_test,y_pred))

In [None]:
# Mean 5-fold cross-validated accuracy of the decision tree pipeline on the training split.
cv_results = cross_val_score(
    text_clf, X_train, y_train,
    cv=5, scoring="accuracy", n_jobs=-1,
)
print(cv_results.mean())

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Same TF-IDF front end, now feeding a CatBoost classifier with default settings.
text_clf = Pipeline(steps=[
    ('tf', TfidfVectorizer(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        ngram_range=(1, 2),
        stop_words='english',
    )),
    ('clf', CatBoostClassifier()),
])

In [None]:
# Fit the TF-IDF + CatBoost pipeline on the raw training text.
text_clf.fit(X_train, y_train)

In [None]:
# Spot-check: predicted product for one hand-written complaint.
# (The original cell referenced the undefined name `ptext_clf` and raised NameError.)
text_clf.predict(['I have outdated information on my credit repor'])[0]

In [None]:
# Predictions of the CatBoost pipeline for the held-out test split.
y_pred=text_clf.predict(X_test)

In [None]:
# Display the raw CatBoost predictions to inspect what the pipeline returned.
y_pred

In [None]:
# Per-class precision / recall / F1 of the CatBoost predictions on the test split.
print(metrics.classification_report(y_test,y_pred))

In [None]:
# Mean 5-fold cross-validated accuracy of the CatBoost pipeline on the training split.
cv_results = cross_val_score(
    text_clf, X_train, y_train,
    cv=5, scoring="accuracy", n_jobs=-1,
)
print(cv_results.mean())

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Stand-alone random forest with default settings; the pipeline below builds its own
# instance, and `rf` is reassigned in the hyperparameter tuning section.
rf=RandomForestClassifier()

In [None]:
# Same TF-IDF front end, now feeding a random forest with default settings.
text_clf = Pipeline(steps=[
    ('tf', TfidfVectorizer(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        ngram_range=(1, 2),
        stop_words='english',
    )),
    ('clf', RandomForestClassifier()),
])

In [None]:
# Fit the TF-IDF + random forest pipeline on the raw training text.
text_clf.fit(X_train, y_train)

In [None]:
# Spot-check: predicted product for one hand-written complaint.
text_clf.predict(['I have outdated information on my credit repor'])[0]

In [None]:
# Predictions of the random forest pipeline for the held-out test split.
y_pred=text_clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the random forest predictions on the test split.
print(metrics.classification_report(y_test,y_pred))

In [None]:
# 5-fold cross-validated accuracy of the random forest pipeline on the training split;
# the per-fold scores are summarised in the next cell.
cv_results = cross_val_score(
    text_clf, X_train, y_train,
    cv=5, scoring="accuracy", n_jobs=-1,
)

In [None]:
# Average accuracy across the five folds.
print(cv_results.mean())

In [None]:
from xgboost import XGBClassifier

In [None]:
# Same TF-IDF front end, now feeding an XGBoost classifier with default settings.
text_clf = Pipeline(steps=[
    ('tf', TfidfVectorizer(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        ngram_range=(1, 2),
        stop_words='english',
    )),
    ('clf', XGBClassifier()),
])

In [None]:
# Fit the TF-IDF + XGBoost pipeline on the raw training text.
# NOTE(review): y_train holds the Product strings from the train/test split; recent xgboost
# releases appear to require integer-encoded class labels — confirm this runs on the
# installed version.
text_clf.fit(X_train, y_train)

In [None]:
# Predictions of the XGBoost pipeline for the held-out test split.
y_pred=text_clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the XGBoost predictions on the test split.
print(metrics.classification_report(y_test,y_pred))

In [None]:
# 5-fold cross-validated accuracy of the XGBoost pipeline on the training split;
# the per-fold scores are summarised in the next cell.
cv_results = cross_val_score(
    text_clf, X_train, y_train,
    cv=5, scoring="accuracy", n_jobs=-1,
)

In [None]:
# Average accuracy across the five folds.
print(cv_results.mean())

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Same TF-IDF front end, now feeding a LightGBM classifier with default settings.
text_clf = Pipeline(steps=[
    ('tf', TfidfVectorizer(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        ngram_range=(1, 2),
        stop_words='english',
    )),
    ('clf', LGBMClassifier()),
])

In [None]:
# Fit the TF-IDF + LightGBM pipeline on the raw training text.
text_clf.fit(X_train, y_train)

In [None]:
# Predictions of the LightGBM pipeline for the held-out test split.
y_pred=text_clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the LightGBM predictions on the test split.
print(metrics.classification_report(y_test,y_pred))

In [None]:
# 5-fold cross-validated accuracy of the LightGBM pipeline on the training split;
# the per-fold scores are summarised in the next cell.
cv_results = cross_val_score(
    text_clf, X_train, y_train,
    cv=5, scoring="accuracy", n_jobs=-1,
)

In [None]:
# Average accuracy across the five folds.
print(cv_results.mean())

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [None]:
 # Randomized Search CV: candidate values for the random-forest hyperparameter search.
import numpy as np

# Number of trees: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Features considered per split. 'auto' is no longer listed because current scikit-learn
# releases reject it for random forests; for classifiers it was an alias of 'sqrt'.
max_features = ['sqrt', 'log2']
# Maximum tree depth: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum samples required to split an internal node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum samples required at a leaf.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space for the random-forest search, keyed by estimator parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Base random forest (default settings) handed to the randomised search below.
rf = RandomForestClassifier()

In [None]:
# Use the random grid to search for best hyperparameters: 10 sampled settings, 5-fold CV,
# scored by accuracy. A fresh RandomForestClassifier is passed in (rather than the current
# value of `rf`) so re-running this cell never wraps an already-built search object, and
# random_state pins which settings are sampled.
rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=random_grid,
    scoring='accuracy',
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Run the random-forest search on the TF-IDF weighted training matrix.
rf.fit(X_train_tfidf, y_train)

In [None]:
# Hyperparameter combination that scored best in the random-forest search.
rf.best_params_

In [None]:
# Mean cross-validated accuracy achieved by that best combination.
rf.best_score_

In [None]:
from scipy.stats import uniform, randint

In [None]:
# Base XGBoost classifier (default settings) handed to the randomised search below.
xgb=XGBClassifier()

In [None]:
# Sampling distributions for the XGBoost search. scipy's uniform(loc, scale) draws from
# [loc, loc + scale]; randint(low, high) draws integers in [low, high).
params = dict(
    gamma=uniform(0, 0.5),
    learning_rate=uniform(0.03, 0.3),
    max_depth=randint(2, 6),
    n_estimators=randint(100, 150),
    subsample=uniform(0.6, 0.4),
)

In [None]:
# Randomised search over `params`: 10 sampled settings, 5-fold CV, scored by accuracy.
# A fresh XGBClassifier is passed in (rather than the current value of `xgb`) so re-running
# this cell never wraps an already-built search object, and random_state pins the draws
# taken from the scipy distributions.
xgb = RandomizedSearchCV(
    estimator=XGBClassifier(),
    param_distributions=params,
    scoring='accuracy',
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Run the XGBoost search on the TF-IDF weighted training matrix.
# NOTE(review): y_train holds Product strings; recent xgboost releases appear to require
# integer-encoded class labels — confirm this runs on the installed version.
xgb.fit(X_train_tfidf, y_train)

In [None]:
# Hyperparameter combination that scored best in the XGBoost search.
xgb.best_params_

In [None]:
# Mean cross-validated accuracy achieved by that best combination.
xgb.best_score_

### Stacking Classifier

In [None]:
from sklearn.ensemble import StackingClassifier
# Base learners for the stack. `rf` is the fitted RandomizedSearchCV at this point, so its
# tuned forest (`best_estimator_`) is used directly; passing the search object itself would
# re-run the whole hyperparameter search inside every internal stacking fold.
# NOTE(review): the XGBClassifier base learner is trained on string labels here as well —
# confirm the installed xgboost accepts them.
estimators = [('rf', rf.best_estimator_), ('mb', mb), ('ab', XGBClassifier())]

# No final_estimator is given, so StackingClassifier falls back to its default meta-learner.
clf = StackingClassifier(estimators=estimators)

In [None]:
# Fit the stacked ensemble on the TF-IDF weighted training matrix.
clf.fit(X_train_tfidf, y_train)

In [None]:
# Mean 5-fold cross-validated accuracy of the stacked model on the TF-IDF training features.
cv_results = cross_val_score(
    clf, X_train_tfidf, y_train,
    cv=5, scoring="accuracy", n_jobs=-1,
)
print(cv_results.mean())