In [5]:
import numpy as np
import pandas as pd
import datetime
import re
import nltk
from nltk.stem import SnowballStemmer # Stemming
from nltk.tokenize import RegexpTokenizer # Tokenizing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans # K-Means Clusterting
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram, linkage, fcluster # Ward Clustering
from scipy.spatial import distance
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
# English Snowball stemmer shared by every call to tokenize_and_stem().
stemmer = SnowballStemmer('english')

def tokenize_and_stem(text):
  """Normalise a complaint narrative and return its stemmed tokens.

  Steps: cast to ``str``, lowercase, drop the literal markers ``{html}``
  and ``/p[``, replace punctuation with spaces, remove digits, split on
  word characters and stem each token.

  Parameters
  ----------
  text : any
      Raw narrative; it is passed through ``str()`` first, so non-string
      values are stringified rather than rejected.

  Returns
  -------
  list of str
      Stemmed tokens in their original order (empty list if no word
      characters remain).
  """
  text1 = str(text).lower()
  text1 = text1.replace('{html}', "") # Strip the literal '{html}' marker
  text1 = text1.replace('/p[', "")
  # Punctuation becomes a space (not '') so that words separated only by
  # punctuation, e.g. "fee,charge", are not glued into a single token.
  cleantext = re.sub(r'[^\w\s]', ' ', text1)
  rem_num = re.sub(r'[0-9]+', '', cleantext)
  tokens = tokenizer.tokenize(rem_num)

  return [stemmer.stem(w) for w in tokens]

In [8]:
# Fetch the NLTK 'stopwords' corpus and keep the English word list as a set
# in the module-level name `stop_words`.
# NOTE(review): these words are unstemmed, while tokenize_and_stem() stems the
# documents - confirm whether the stop words should be stemmed as well.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Understanding TF-IDF
Reference: https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/

In [10]:
def tfidf_calc(comments, ngram, stopwords):
  """Build a TF-IDF matrix for a collection of documents.

  Parameters
  ----------
  comments : iterable
      One entry per document. Each entry is either a string or a sequence
      of tokens; token sequences are joined with single spaces because
      TfidfVectorizer works on strings.
  ngram : sequence of two ints
      ``(min_n, max_n)`` forwarded as ``ngram_range``.
  stopwords : collection of str, str or None
      Stop words forwarded as ``stop_words``. Non-string collections
      (e.g. a set) are converted to a list first.

  Returns
  -------
  tfidf_matrix : sparse matrix
      Result of ``fit_transform`` on the documents.
  terms : list
      Feature names, in the column order of ``tfidf_matrix``.
  """
  documents = [doc if isinstance(doc, str) else ' '.join(doc) for doc in comments]

  # Strings (e.g. 'english') and None pass through; other collections become a list.
  if stopwords is not None and not isinstance(stopwords, str):
    stopwords = list(stopwords)

  tfidf_vectorizer = TfidfVectorizer(max_df = 0.5, max_features = 1000000, min_df = 0.05, use_idf = True, stop_words = stopwords, ngram_range = tuple(ngram))
  tfidf_matrix = tfidf_vectorizer.fit_transform(documents) # Fit vectorizer to complaints

  # get_feature_names() was removed from newer scikit-learn releases; prefer
  # the replacement when it exists and fall back for old installations.
  if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = list(tfidf_vectorizer.get_feature_names_out())
  else:
    terms = tfidf_vectorizer.get_feature_names()

  return tfidf_matrix, terms

## Difference between K-means and hierarchical clustering
Reference: https://www.geeksforgeeks.org/difference-between-k-means-and-hierarchical-clustering/

In [11]:
def kmeans(tfidf_matrix, num_clusters, random_state = None):
  """Cluster the rows of ``tfidf_matrix`` with K-Means.

  Parameters
  ----------
  tfidf_matrix : array-like or sparse matrix
      One row per document.
  num_clusters : int
      Number of clusters to fit.
  random_state : int or None, optional
      Forwarded to ``KMeans``; pass an int to make the run repeatable.
      The default (None) keeps the previous unseeded behaviour.

  Returns
  -------
  clusters : ndarray
      Cluster label of every row (``labels_``).
  cluster_centers : ndarray
      Fitted centroids (``cluster_centers_``), one row per cluster.
  """
  km = KMeans(n_clusters = num_clusters, random_state = random_state).fit(tfidf_matrix)
  # Fitted attributes carry a trailing underscore in scikit-learn.
  return km.labels_, km.cluster_centers_

def wardClustering(tfidf_matrix, max_d):
  """Cluster documents with Ward linkage on cosine distances.

  Parameters
  ----------
  tfidf_matrix : array-like or sparse matrix
      One row per document.
  max_d : float
      Threshold passed to ``fcluster`` with ``criterion='distance'``.

  Returns
  -------
  clusters : ndarray
      Flat cluster label of every row, as returned by ``fcluster``.
  cluster_centers : ndarray
      Mean TF-IDF vector of each cluster, rows ordered by ascending label.
  linkage_matrix : ndarray
      Linkage matrix produced by ``ward``.
  """
  dist = 1-cosine_similarity(tfidf_matrix)
  dist = np.clip(dist, 0, 1)
  # squareform(checks=True) rejects a matrix whose diagonal is not exactly
  # zero or that is not exactly symmetric. Floating-point round-off, and
  # all-zero rows (self-similarity 0, so distance 1), violate that.
  dist = (dist + dist.T) / 2
  np.fill_diagonal(dist, 0)
  dist_cnds = distance.squareform(dist, checks = True)

  linkage_matrix = ward(dist_cnds)
  clusters = fcluster(linkage_matrix, max_d, criterion = 'distance')

  # One centroid per label; asarray + ravel flattens the 1 x n matrix that a
  # sparse mean returns as well as the 1-D array a dense mean returns.
  cluster_centers = np.vstack([
    np.asarray(tfidf_matrix[np.flatnonzero(clusters == label)].mean(axis = 0)).ravel()
    for label in np.unique(clusters)
  ])

  return clusters, cluster_centers, linkage_matrix

In [13]:
def topNwords(clusters, cluster_centers, terms, n):
  """List the ``n`` highest-weighted terms of every cluster centre.

  Parameters
  ----------
  clusters : array-like
      Cluster label of every document; only used to count cluster sizes.
  cluster_centers : array-like, shape (n_clusters, n_terms)
      Centroid weights, one row per cluster.
  terms : sequence of str
      Term for each centroid column.
  n : int
      Number of terms to report; capped at the number of columns.

  Returns
  -------
  pandas.DataFrame
      One row per centroid with columns ``top1`` .. ``topN`` (highest weight
      first) and ``size``, the document counts from ``np.unique(clusters)``
      in ascending label order.
  """
  cluster_centers = np.asarray(cluster_centers)
  n = min(n, cluster_centers.shape[1])

  order_centroids = cluster_centers.argsort()[:, ::-1] # Descending weight
  topN_ind = order_centroids[:, :n]
  # A list (not a generator) is required: np.array(<generator>) would build
  # a 0-d object array that cannot be reshaped.
  words_matrix = np.array([terms[i] for row in topN_ind for i in row], dtype = object).reshape(topN_ind.shape)

  topNwords_df = pd.DataFrame(words_matrix, columns = ['top' + str(i) for i in range(1, n+1)])
  _, sizes = np.unique(clusters, return_counts = True)
  topNwords_df['size'] = sizes

  return topNwords_df

In [15]:
def main(df, n_cluster, n_words, max_d, method):
  """Vectorise the complaints, cluster them and summarise each cluster.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the column ``'Consumer complaint narrative1'``. A
      ``'cluster'`` column is added to this frame in place.
  n_cluster : int
      Number of clusters; only used when ``method`` is ``'kmeans'``.
  n_words : int
      Number of top terms reported per cluster.
  max_d : float
      Distance threshold; only used when ``method`` is ``'ward'``.
  method : str
      ``'kmeans'`` or ``'ward'`` (case-insensitive).

  Returns
  -------
  tuple
      ``(topNwords_df, df)`` for k-means, or
      ``(topNwords_df, df, linkage_matrix)`` for Ward.

  Raises
  ------
  ValueError
      If ``method`` is neither ``'kmeans'`` nor ``'ward'``.
  """
  method = str(method).lower()
  if method not in ('kmeans', 'ward'):
    raise ValueError("method must be 'kmeans' or 'ward', got %r" % (method,))

  # Use the frame that was passed in (not a notebook global) and the
  # stop-word set built earlier in the notebook, not the nltk corpus module.
  tfidf_matrix, terms = tfidf_calc(df['Consumer complaint narrative1'], (1, 3), list(stop_words))

  linkage_matrix = None
  if method == 'kmeans':
    clusters, cluster_centers = kmeans(tfidf_matrix, n_cluster)
  else:
    clusters, cluster_centers, linkage_matrix = wardClustering(tfidf_matrix, max_d)

  topNwords_df = topNwords(clusters, cluster_centers, terms, n_words)
  df['cluster'] = pd.Series(clusters, index = df.index)

  if method == 'kmeans':
    return topNwords_df, df
  return topNwords_df, df, linkage_matrix

In [16]:
def array2set(textArray):
  """Return the set of space-separated words found across all entries.

  Every element of ``textArray`` is split on single spaces and the
  resulting pieces are pooled into one set.
  """
  return {piece for entry in textArray for piece in entry.split(' ')}

In [17]:
def alert(topNwords_df, df_week, threshold4vs3, threshold4vs2, threshold4vs1, floor, min_size, max_d, excludeTopics):
  """Flag clusters whose share of complaints grew in the latest period.

  For each cluster, its share of all complaints in the latest ``month``
  value is divided by its share in three earlier periods (latest - 1,
  earliest + 1 and earliest). A cluster is kept when its total volume
  exceeds ``floor`` AND (any ratio exceeds its threshold OR its volume
  exceeds ``min_size``). Clusters whose top words contain two or more
  entries of ``excludeTopics`` are then dropped.

  Parameters
  ----------
  topNwords_df : pandas.DataFrame
      One row per cluster, in ascending cluster-label order. The columns
      ``rate4vs3``, ``rate4vs2`` and ``rate4vs1`` are added in place.
  df_week : pandas.DataFrame
      Complaints with ``'cluster'`` and numeric ``'month'`` columns.
  threshold4vs3, threshold4vs2, threshold4vs1 : float
      Cut-offs for the three share ratios.
  floor : number
      Minimum total volume a cluster needs to be considered at all.
  min_size : number
      Volume above which a cluster is flagged regardless of the ratios.
  max_d : float
      Unused; kept so existing calls keep working.
  excludeTopics : iterable of str
      Terms that mark a cluster as uninteresting.

  Returns
  -------
  alert_topN : pandas.DataFrame
      Flagged rows of ``topNwords_df``; a ``'cluster'`` column holding the
      cluster label is appended when the input has none.
  alert_details : pandas.DataFrame
      Rows of ``df_week`` that belong to the flagged clusters.

  Raises
  ------
  KeyError
      If one of the four periods needed for the ratios has no data.
  """
  cluster_countA = df_week.groupby('cluster').size().reset_index(name = 'volume')
  cluster_countB = df_week.groupby(['cluster', 'month']).size().reset_index(name = 'volume')

  cluster_count = cluster_countB.pivot(index = 'cluster', columns = 'month', values = 'volume')

  latest = max(df_week['month'])
  previous = latest - 1
  earliest = min(df_week['month'])
  second = earliest + 1

  missing = [m for m in (latest, previous, second, earliest) if m not in cluster_count.columns]
  if missing:
    raise KeyError('no complaints found for month value(s): %s' % missing)

  # Clusters absent from a period are NaN in the pivot, which makes their
  # ratio NaN and therefore never above a threshold.
  share_latest = cluster_count[latest]/cluster_count[latest].sum()
  week_rate1 = share_latest/(cluster_count[previous]/cluster_count[previous].sum())
  week_rate2 = share_latest/(cluster_count[second]/cluster_count[second].sum())
  week_rate3 = share_latest/(cluster_count[earliest]/cluster_count[earliest].sum())

  topNwords_df['rate4vs3'] = week_rate1.values
  topNwords_df['rate4vs2'] = week_rate2.values
  topNwords_df['rate4vs1'] = week_rate3.values

  # Everything below is positional: row p of topNwords_df is assumed to be
  # the p-th cluster label in ascending order, as in cluster_countA.
  volume = cluster_countA['volume'].to_numpy()
  rate_hit = ((topNwords_df['rate4vs3'] > threshold4vs3)
              | (topNwords_df['rate4vs2'] > threshold4vs2)
              | (topNwords_df['rate4vs1'] > threshold4vs1)).to_numpy()
  selected = (volume > floor) & (rate_hit | (volume > min_size))

  alert_topN = topNwords_df.loc[selected].copy()
  if 'cluster' not in alert_topN.columns:
    # The summary frame is keyed by row position, not by cluster label
    # (fcluster labels start at 1), so attach the real label for the join.
    alert_topN['cluster'] = cluster_countA['cluster'].to_numpy()[selected]

  # Compare only top-word columns (at most the first five); fall back to
  # the first five columns when none are named 'top...'.
  word_cols = [c for c in alert_topN.columns if str(c).startswith('top')][:5] or list(alert_topN.columns[:5])
  top_words = alert_topN[word_cols].to_numpy()
  excluded = set(excludeTopics)
  index_lst = [pos for pos in np.arange(alert_topN.shape[0])
               if len(array2set(top_words[pos]).intersection(excluded)) < 2]

  alert_topN = alert_topN.iloc[index_lst, :]
  alert_details = df_week.loc[df_week['cluster'].isin(alert_topN['cluster'])]

  return alert_topN, alert_details

In [None]:
# Load the raw complaints and keep only rows that have a narrative.
df = pd.read_csv('june22.csv', encoding = 'utf-8')
# .copy() makes finaldf an independent frame: later cells add columns to it,
# which on a dropna() result can trigger pandas' SettingWithCopyWarning.
finaldf = df.dropna(subset = ['Consumer complaint narrative']).copy()
finaldf.shape

In [None]:
# Cut-offs that alert() compares against rate4vs3 / rate4vs2 / rate4vs1:
# a cluster's share of complaints in the latest month divided by its share
# in the month before it, the second-earliest month and the earliest month.
threshold4vs3 = 1.2
threshold4vs2 = 1.4
threshold4vs1 = 1.6

# Distance threshold that main() forwards to wardClustering(), where it is
# passed to fcluster(..., criterion = 'distance').
max_d = 1.8

# alert() drops a cluster when two or more of the words in its first five
# columns appear in this list.
# NOTE(review): entries look like stemmed forms, matching tokenize_and_stem() output - confirm.
excludeTopics = ['scra', 'militari']

In [None]:
# TfidfVectorizer works on one string per document, so the stemmed tokens
# are joined back into a space-separated string.
finaldf['Consumer complaint narrative1'] = finaldf['Consumer complaint narrative'].map(lambda s: ' '.join(tokenize_and_stem(s)))

# main() compares `method` against the lowercase names 'kmeans' / 'ward'.
topNwords_df, df_week, linkage_matrix = main(finaldf, n_cluster = None, n_words = 5, max_d = max_d, method = 'ward')