<img src="http://bigdataexperience.org/BX/wp-content/uploads/2016/05/BX-FULL.png" width="200" height="200" alt="Big Data Experience Center, King Mongkut's University of Technology Thonburi">

# <center>Text Mining</center>
# <center>Module 5 - Text Classification</center>
---

# 1. Load data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
# Load the complaint dataset from a gzip-compressed pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('consumer_complaint_dataset.data', compression='gzip')
# Preview the first rows.
df.head()

In [None]:
# Show the (rows, columns) dimensions of the dataset.
df.shape

In [None]:
# Display the DataFrame as rendered by the notebook.
df

In [None]:
# Count the complaints per topic and list the topics from most to least frequent.
topic_counts = pd.crosstab(df.topic, columns='Count')
topic_counts.sort_values(by='Count', ascending=False)

# 2. Label grouping / cleaning

- Group "Credit reporting" into "Credit reporting, credit repair services, or other personal consumer reports".
- Group "Credit card"/"Prepaid card" into "Credit card or prepaid card".
- Group "Payday loan" into "Payday loan, title loan, or personal loan".
- Group "Virtual currency"/"Money transfers" into "Money transfer, virtual currency, or money service".
- Remove "Other financial service"

In [None]:
# Map each legacy topic label onto the consolidated category it belongs to.
# None of the target labels is itself a key, so the order of the entries
# does not matter.
topic_merges = {
    'Credit reporting': 'Credit reporting, credit repair services, or other personal consumer reports',
    'Credit card': 'Credit card or prepaid card',
    'Prepaid card': 'Credit card or prepaid card',
    'Payday loan': 'Payday loan, title loan, or personal loan',
    'Virtual currency': 'Money transfer, virtual currency, or money service',
    'Money transfers': 'Money transfer, virtual currency, or money service',
}
for old_topic, merged_topic in topic_merges.items():
    df.loc[df['topic'] == old_topic, 'topic'] = merged_topic

# Drop the catch-all category. The explicit .copy() makes df an independent
# frame instead of a slice of the original, so the later column assignments
# (df['input'] = ...) do not trigger SettingWithCopyWarning.
df = df[df['topic'] != 'Other financial service'].copy()

In [None]:
# Bar chart of the number of complaints per topic, largest count first.
topic_counts = df['topic'].value_counts().sort_values(ascending=False)
topic_counts.plot(kind='bar', title='Number complaints in each topic')
plt.show()

In [None]:
def print_plot(index):
    """Print the complaint text and topic of the row labelled ``index``.

    The row is looked up by index label in the module-level ``df``; its
    ``input`` column is printed first, then its ``topic`` column. Nothing is
    printed when no row carries that label.

    Args:
        index: index label of the complaint to show.
    """
    rows = df.loc[df.index == index, ['input', 'topic']]
    # Guard on the selection itself: indexing .values[0] on an empty
    # selection raises IndexError before any later check can run.
    if rows.empty:
        return
    text, topic = rows.values[0]
    print(text)
    print('Topic:', topic)
        
# Show the complaint with index label 10 and its topic.
print_plot(10)

## Steps:
1. Convert all text to lower case.
2. Replace every symbol matched by REPLACE_BY_SPACE_RE with a space.
3. Remove every symbol matched by BAD_SYMBOLS_RE from the text.
4. Remove the “x” characters used as redaction masks (e.g. “XXXX”) from the text.
5. Remove stop words.
6. Remove digits in text.

In [None]:
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus that stopwords.words() reads.
nltk.download('stopwords')

In [None]:
# List the English stop words provided by NLTK.
stopwords.words('english')

In [None]:

import re

# Separator punctuation that gets replaced by a space during cleaning.
# Raw string: '\[', '\]' and '\|' are not valid escapes in a normal string
# literal and trigger an invalid-escape warning on current Python versions.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, a lowercase ASCII letter, space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Stored as a set so each membership test during cleaning is O(1).
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one complaint narrative before vectorization.

    Steps, in order: lowercase the text; replace every REPLACE_BY_SPACE_RE
    match with a space; delete every BAD_SYMBOLS_RE match; split on
    whitespace and drop stop words and all-"x" tokens; re-join the remaining
    words with single spaces.

    Args:
        text: a string.

    Returns:
        The cleaned string (empty when nothing survives the filtering).
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # Drop only tokens made up entirely of "x" (presumably the dataset's
    # redaction masks such as "xxxx" or the parts of "xx/xx/xxxx").
    # Deleting every "x" character instead also mangles real words
    # ("tax" -> "ta", "experian" -> "eperian").
    kept_words = [
        word for word in text.split()
        if word not in STOPWORDS and word.strip('x')
    ]
    return ' '.join(kept_words)

# Normalize every complaint narrative with clean_text.
df['input'] = df['input'].apply(clean_text)

# Remove every run of digits. regex=True is required: since pandas 2.0
# Series.str.replace treats the pattern as a literal string by default, so
# without it the call searches for the text "\d+" and removes nothing.
df['input'] = df['input'].str.replace(r'\d+', '', regex=True)

In [None]:
# Show the complaint with index label 10 again, now that the text has been cleaned.
print_plot(10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF document-term matrix, capping the vocabulary at 20,000 terms.
complaint_texts = df['input']
vectorizer = TfidfVectorizer(max_features=20000)
tfidf_matrix = vectorizer.fit_transform(complaint_texts)

In [None]:
# Display the matrix summary produced by the vectorizer.
tfidf_matrix

In [None]:
import numpy as np
# Vocabulary terms as an array, so they can be indexed by tfidf_matrix column.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
decoding = np.array(feature_names)

In [None]:
# List the vocabulary terms with a non-zero weight for the complaint labelled 10.
# tfidf_matrix rows are positional, whereas print_plot(10) looks the complaint
# up by index label; rows were dropped from df earlier without resetting the
# index, so the label is translated to its row position first.
# NOTE(review): assumes index labels are unique, so get_loc returns one integer.
row_position = df.index.get_loc(10)
decoding[np.flatnonzero(tfidf_matrix[row_position].toarray()[0])]

In [None]:
# Print the complaint with index label 10 for comparison with the terms listed above.
print_plot(10)

# 3. Feature selection

In [None]:
# One-hot encode the topic labels into an array with one column per topic.
topic_indicators = pd.get_dummies(df['topic'])
Y = topic_indicators.values

In [None]:
from sklearn.feature_selection import SelectPercentile, f_classif

# Score every TF-IDF feature against the topic labels with f_classif and keep
# the best-scoring 10% of the features.
# NOTE(review): the selector is fitted on the whole dataset, before the
# train/test split, so the held-out labels influence which features are kept.
# Consider fitting it on the training portion only.
selector = SelectPercentile(score_func=f_classif, percentile=10)
selector.fit(X=tfidf_matrix, y=df['topic'])

In [None]:
# Keep only the selected feature columns and convert the result to a dense array.
X = selector.transform(tfidf_matrix).toarray()

In [None]:
# Show the (samples, selected features) dimensions.
X.shape

In [None]:
# Show the vocabulary terms retained by the selector (get_support() is a boolean mask).
decoding[selector.get_support()]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the topic names as integer class ids.
encoder = LabelEncoder()
encoder.fit(df['topic'])
Y = encoder.transform(df['topic'])

# 4. Hold out sampling

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing. A fixed random_state makes the
# split -- and therefore every score computed from it -- reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, df['topic'], test_size=0.3, random_state=42)

# 5. Naive Bayes

In [None]:
%%time
from sklearn.naive_bayes import GaussianNB
# Train a Gaussian Naive Bayes classifier on the training split.
model = GaussianNB()
model.fit(X=X_train, y=y_train)

In [None]:
%%time
# Mean accuracy of the fitted model on the training split.
score_train = model.score(X_train, y_train)

In [None]:
%%time
# Mean accuracy of the fitted model on the held-out test split.
score_test = model.score(X_test, y_test)

In [None]:
# Report both accuracies; a large gap between them indicates overfitting.
print("\nTrain set score:", score_train)
print("Test set score:", score_test)

# 6. SVM

In [None]:
from sklearn.svm import LinearSVC
# Train a linear SVM; class_weight='balanced' reweights classes inversely to
# their frequency in the training labels.
model = LinearSVC(class_weight='balanced')
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the topic of every held-out complaint.
res = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Print the per-class precision, recall, F1 and support for the predictions.
print(classification_report(y_pred=res, y_true=y_test))

In [None]:
from sklearn.metrics import accuracy_score
# Overall fraction of held-out complaints classified correctly.
accuracy_score(y_pred=res, y_true=y_test)

# 7. Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Train a 100-tree random forest with balanced class weights, using all CPU
# cores (n_jobs=-1).
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    n_jobs=-1,
)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out topics with the fitted model and print the
# per-class precision, recall, F1 and support.
res = model.predict(X_test)
print(classification_report(y_pred=res, y_true=y_test))

---