In [37]:
import re
from textblob import TextBlob
import numpy as np
from textblob import Word
from textblob.classifiers import NaiveBayesClassifier
import nltk
from nltk.probability import FreqDist
import random
import pandas as pd
from textblob import TextBlob
from sklearn.cluster import KMeans

## Read File, Select Desired Column

In [18]:
file=pd.read_csv('Data1.csv', encoding='ISO-8859-1', error_bad_lines=False).loc[:, ["SR Summary"]]["SR Summary"];
file[1]

  interactivity=interactivity, compiler=compiler, result=result)


'inspection, dielectric. leak at digger(fitting in cab) ,remove extension shaft from winch,emg. stop not working at controls, boom cuts out(wait or play with outriggers works)'

## Clean Up Input

In [19]:
clean=[re.sub(r"  +"," ",re.sub(r"(?![\w\s]).", " ",line).lower()).strip() for line in file] #remove non-alphaneumeric
clean=list(filter(lambda x: x!='road service request' and x!='service request', clean))
clean_grams=[re.sub(r"  +"," ",re.sub(r"( (and|or|not) )((and|or|not) )*"," ",line)).strip() for line in clean] #remove connection words
clean[1] # 49010 48831 48981 48802 re.sub(r" (and|or|not) "," ", )

'inspection dielectric leak at digger fitting in cab remove extension shaft from winch emg stop not working at controls boom cuts out wait or play with outriggers works'

## Extract Tokens, Bigrams, Trigrams, 4-Grams

In [20]:
appended_summary=" ".join([line for line in clean])
tokens = nltk.word_tokenize(appended_summary)
freq = nltk.FreqDist(tokens)

In [21]:
freq.most_common(10)

[('boom', 7516),
 ('inspection', 6413),
 ('pm', 6216),
 ('and', 5924),
 ('unit', 5312),
 ('leak', 4484),
 ('dielectric', 4020),
 ('not', 3835),
 ('in', 3612),
 ('at', 3495)]

In [22]:
bigrams = [bigram for line in clean_grams for bigram in list(nltk.bigrams(line.split(" ")))]
bigram_freq = nltk.FreqDist(list(bigrams))
sorted_bigram_freq = bigram_freq.most_common()
sorted_bigram_freq[0:10]

[(('pm', 'inspection'), 3098),
 (('dielectric', 'test'), 2664),
 (('leak', 'at'), 2054),
 (('inspection', 'dielectric'), 1508),
 (('pm', 'dielectric'), 1488),
 (('hydraulic', 'leak'), 1441),
 (('stuck', 'in'), 1221),
 (('unit', 'down'), 1067),
 (('boom', 'functions'), 1058),
 (('repairs', 'from'), 967)]

In [23]:
trigrams = [trigram for line in clean_grams for trigram in list(nltk.trigrams(line.split(" ")))]
trigram_freq = nltk.FreqDist(list(trigrams))
sorted_trigram_freq = trigram_freq.most_common()
sorted_trigram_freq[0:10]

[(('inspection', 'dielectric', 'test'), 1346),
 (('pm', 'inspection', 'dielectric'), 1070),
 (('level', 'b', 'derrick'), 628),
 (('boom', 'stuck', 'in'), 621),
 (('repairs', 'from', 'inspection'), 599),
 (('pm', 'dielectric', 'test'), 596),
 (('hydraulic', 'leak', 'at'), 570),
 (('stuck', 'in', 'the'), 547),
 (('in', 'the', 'air'), 543),
 (('b', 'derrick', '6s'), 524)]

In [24]:
fourgrams = [fourgram for line in clean_grams for fourgram in list(nltk.ngrams(line.split(" "), 4))]
fourgram_freq = nltk.FreqDist(list(fourgrams))
sorted_fourgram_freq = fourgram_freq.most_common()
sorted_fourgram_freq[0:10]

  """Entry point for launching an IPython kernel.


[(('pm', 'inspection', 'dielectric', 'test'), 963),
 (('level', 'b', 'derrick', '6s'), 523),
 (('stuck', 'in', 'the', 'air'), 518),
 (('boom', 'stuck', 'in', 'the'), 317),
 (('annual', 'pm', 'dielectric', 'test'), 315),
 (('inspection', 'annual', 'pm', 'dielectric'), 306),
 (('boom', 'stuck', 'in', 'air'), 269),
 (('annual', 'inspection', 'dielectric', 'test'), 157),
 (('check', 'perform', 'if', 'required'), 151),
 (('perform', 'if', 'required', 'csn'), 149)]

## POS Tagging

In [25]:
wnl = nltk.WordNetLemmatizer()

def verb_checker(pair):
    if pair[1].startswith("V"):
        
        return Word(pair[0]).lemmatize("v")
    else:
        return pair[0]

In [26]:
tagged = nltk.pos_tag(tokens)
new_tokens = [verb_checker(item) for item in tagged] 
new_tokens[:5]

['complete', 'foot', 'pedal', 'wire', 'inspection']

In [27]:
new_tokens_tagged = nltk.pos_tag(new_tokens)

In [28]:
dic_of_pos = {"VB":[], "JJ":[], "RB":[], "NN":[]}
for token in set(new_tokens):
    pair = nltk.pos_tag([token])[0]
    if pair[1].startswith("VB"):
        dic_of_pos["VB"].append(pair[0])
    if pair[1].startswith("JJ"):
        dic_of_pos["JJ"].append(pair[0])
    if pair[1].startswith("NN"):
        dic_of_pos["NN"].append(pair[0])
    if pair[1].startswith("RB"):
        dic_of_pos["RB"].append(pair[0])

In [29]:
new_tokens_freq = nltk.FreqDist(new_tokens)
new_tokens_freq.most_common(10)

[('boom', 7553),
 ('inspection', 6413),
 ('pm', 6216),
 ('leak', 5938),
 ('and', 5924),
 ('unit', 5312),
 ('dielectric', 4020),
 ('not', 3835),
 ('in', 3612),
 ('at', 3495)]

## Most Frequent Nouns/Verbs/Adjectives/Adverbs

In [30]:
nouns = dic_of_pos["NN"]
noun_freq = [(item, freq[item]) for item in nouns]
sorted_noun_freq = sorted(noun_freq, key = lambda x: x[1], reverse = True)
sorted_noun_freq[:10]

[('boom', 7516),
 ('inspection', 6413),
 ('pm', 6216),
 ('unit', 5312),
 ('leak', 4484),
 ('dielectric', 4020),
 ('test', 2929),
 ('pole', 2821),
 ('auger', 2765),
 ('winch', 2749)]

In [31]:
verbs = dic_of_pos["VB"]
verb_freq = [(item, new_tokens_freq[item]) for item in verbs]
sorted_verb_freq = sorted(verb_freq, key = lambda x: x[1], reverse = True)
sorted_verb_freq[:10]

[('replace', 2466),
 ('be', 2270),
 ('slow', 935),
 ('have', 631),
 ('go', 594),
 ('come', 401),
 ('do', 307),
 ('make', 261),
 ('get', 241),
 ('lose', 195)]

In [32]:
adj = dic_of_pos["JJ"]
adj_freq = [(item, freq[item]) for item in adj ]
sorted_adj_freq= sorted(adj_freq, key = lambda x:x[1], reverse = True)
sorted_adj_freq[:10]

[('inoperable', 1294),
 ('annual', 1261),
 ('upper', 1233),
 ('lower', 631),
 ('new', 338),
 ('loose', 269),
 ('turntable', 264),
 ('high', 259),
 ('third', 253),
 ('bad', 242)]

In [33]:
adverbs = dic_of_pos["RB"]
adverb_freq = [(item, freq[item]) for item in adverbs]
sorted_adverb_freq = sorted(adverb_freq, key = lambda x: x[1], reverse = True)
sorted_adverb_freq[:10]

[('not', 3835),
 ('down', 2043),
 ('up', 1020),
 ('intermittently', 280),
 ('only', 244),
 ('back', 190),
 ('properly', 146),
 ('too', 124),
 ('very', 122),
 ('correctly', 117)]

In [34]:
def get_str(vec):
    if isinstance(vec, tuple):
        return " ".join(vec).strip()
    return vec.strip()

def feature_provider(parameter_vector):
    feature_pool=[sorted_noun_freq, sorted_verb_freq, sorted_adj_freq, sorted_adverb_freq, sorted_bigram_freq]
    features=[]
    for i in range(len(parameter_vector)):
        features+=[get_str(feature[0]) for feature in feature_pool[i][0:parameter_vector[i]]]
    return list(set(features))

features_used = feature_provider([100,100,130,110,100])
features_used[0:5]

['digger', 'properly', 'together', 'later', 'operation']

In [60]:
binary=[]
for index in range(0, len(clean)):
    binary += ([[int(feature in clean[0]) for feature in features_used[:-100]] + [int(feature in clean_grams[0]) for feature in features_used[-100:]]])
binary[0:2]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [1]:
kmeans = KMeans(n_clusters=16, random_state=0)
kmeans = kmeans.fit(binary)

NameError: name 'KMeans' is not defined