In [1]:
import re
from textblob import TextBlob
import numpy as np
from textblob import Word
from textblob.classifiers import NaiveBayesClassifier
import nltk
from nltk.probability import FreqDist
import random
import pandas as pd
from textblob import TextBlob
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier

## Read File, Select Desired Column

In [2]:
file=pd.read_csv('Data3.csv', encoding='ISO-8859-1', error_bad_lines=False).loc[:, ["SR_NOTE_CORRECTION"]]["SR_NOTE_CORRECTION"];
file[1]

  interactivity=interactivity, compiler=compiler, result=result)


'-- CUT OFF AND REPLACED DAMAGED AREA OF REAR BUMPER - INSTALLED NEW HINGES - PAINTED AREA\n-- STRAIGHTENED TOW EYES\n-- REPLACED AUGER BRACKET ROLL PIN\n-- REPLACED AND ADJUSTED TRANSFER PIN \n-- REPAIRED BOOM GEL COTE\n-- REPAIRED HOSE CARRIER TUBE; - COMPLETED ANNUAL PM INPECTION\n- COMPLETED DIELECTRIC TEST\n- REMOVED AND REPLACED HOSE REEL\n- REMOVED HOSE CLAMP TIES TO CHECK HOSES UNDER CHASSIS FOR DAMAGE - ADDED HOSE PROTECTION\n- CLEANED TRANSFER TUBES - TIGHTENED FITTINGS\n- REINSTALLED TWO SPEED FITTING AT DIGGER \n- REPLACED POLE GUIDE BUSHINGS AND SHIMS\n- ADJUSTED TRANSFER BRACKET\n- TIGHTENED FITTING AT DIGGER MOTOR\n- ADJUSTED POLE GUIDE INTERLOCK\n- TIGHTENED FITTING AT TURRET\n- TOPPED UP HYDRAULIC OIL\n- REPLACED GASKET AT RETURN ON HYDRAULIC TANK\n- REPLACED MALE TOOL COUPLERS AT HOSE REEL\n- REPLACED DAMAGED AND MISSING DUST CAPS ON TOOL COUPLERS\n- REMOVED CAPSTAN EXTENSION - WAS SEIZED - HAD TO CUT OFF - CLEANED UP SHAFT - REPLACED EXTENSION\n- COMPLETED OPERATION 

## Clean Up Input

In [3]:
clean=[re.sub(r"  +"," ",re.sub(r"((?![\w\s]).|\n)", " ",line).lower()).strip() for line in file] #remove non-alphaneumeric
# clean=list(filter(lambda x: x!='road service request' and x!='service request', clean))
clean_grams=[re.sub(r"  +"," ",re.sub(r"( (and|or|not) )((and|or|not) )*"," ",line)).strip() for line in clean] #remove connection words
clean[1] # 49010 48831 48981 48802 re.sub(r" (and|or|not) "," ", )

'cut off and replaced damaged area of rear bumper installed new hinges painted area straightened tow eyes replaced auger bracket roll pin replaced and adjusted transfer pin repaired boom gel cote repaired hose carrier tube completed annual pm inpection completed dielectric test removed and replaced hose reel removed hose clamp ties to check hoses under chassis for damage added hose protection cleaned transfer tubes tightened fittings reinstalled two speed fitting at digger replaced pole guide bushings and shims adjusted transfer bracket tightened fitting at digger motor adjusted pole guide interlock tightened fitting at turret topped up hydraulic oil replaced gasket at return on hydraulic tank replaced male tool couplers at hose reel replaced damaged and missing dust caps on tool couplers removed capstan extension was seized had to cut off cleaned up shaft replaced extension completed operation test checked operation of boom cuts out sometimes no fault found inspected and cleaned emerg

## Extract Tokens, Bigrams, Trigrams, 4-Grams

In [4]:
appended_summary=" ".join([line for line in clean_grams])
tokens = nltk.word_tokenize(appended_summary)
freq = nltk.FreqDist(tokens)

In [5]:
freq.most_common(10)

[('to', 194566),
 ('unit', 134989),
 ('the', 132311),
 ('from', 68426),
 ('on', 54740),
 ('for', 46371),
 ('boom', 45889),
 ('at', 43880),
 ('found', 40540),
 ('in', 40132)]

In [6]:
bigrams = [bigram for line in clean_grams for bigram in list(nltk.bigrams(line.split(" ")))]
bigram_freq = nltk.FreqDist(list(bigrams))
sorted_bigram_freq = bigram_freq.most_common()
sorted_bigram_freq[0:10]

[(('unit', 'to'), 18043),
 (('traveled', 'from'), 15511),
 (('to', 'service'), 14850),
 (('travel', 'from'), 13771),
 (('set', 'up'), 13393),
 (('the', 'unit'), 13375),
 (('pole', 'guide'), 11342),
 (('up', 'unit'), 10772),
 (('installed', 'new'), 9189),
 (('traveled', 'to'), 9154)]

In [None]:
trigrams = [trigram for line in clean_grams for trigram in list(nltk.trigrams(line.split(" ")))]
trigram_freq = nltk.FreqDist(list(trigrams))
sorted_trigram_freq = trigram_freq.most_common()
sorted_trigram_freq[0:10]

In [None]:
fourgrams = [fourgram for line in clean_grams for fourgram in list(nltk.ngrams(line.split(" "), 4))]
fourgram_freq = nltk.FreqDist(list(fourgrams))
sorted_fourgram_freq = fourgram_freq.most_common()
sorted_fourgram_freq[0:10]

## POS Tagging

In [7]:
wnl = nltk.WordNetLemmatizer()

def verb_checker(pair):
    if pair[1].startswith("V"):
        
        return Word(pair[0]).lemmatize("v")
    else:
        return pair[0]

In [8]:
tagged = nltk.pos_tag(tokens)
new_tokens = [verb_checker(item) for item in tagged] 
new_tokens[:5]

['1', 'cut', 'off', 'replace', 'damage']

In [None]:
new_tokens_tagged = nltk.pos_tag(new_tokens)

In [92]:
dic_of_pos = {"VB":[], "JJ":[], "RB":[], "NN":[]}
for token in set(new_tokens):
    pair = nltk.pos_tag([token])[0]
    if pair[1].startswith("VB"):
        dic_of_pos["VB"].append(pair[0])
    if pair[1].startswith("JJ"):
        dic_of_pos["JJ"].append(pair[0])
    if pair[1].startswith("NN"):
        dic_of_pos["NN"].append(pair[0])
    if pair[1].startswith("RB"):
        dic_of_pos["RB"].append(pair[0])

In [93]:
new_tokens_freq = nltk.FreqDist(new_tokens)
new_tokens_freq.most_common(10)

[('boom', 7553),
 ('inspection', 6413),
 ('pm', 6216),
 ('leak', 5938),
 ('and', 5924),
 ('unit', 5312),
 ('dielectric', 4020),
 ('not', 3835),
 ('in', 3612),
 ('at', 3495)]

## Most Frequent Nouns/Verbs/Adjectives/Adverbs

In [158]:
nouns = dic_of_pos["NN"]
noun_freq = [(item, freq[item]) for item in nouns]
sorted_noun_freq = sorted(noun_freq, key = lambda x: x[1], reverse = True)
sorted_noun_freq[:100]

[('boom', 7516),
 ('inspection', 6413),
 ('pm', 6216),
 ('unit', 5312),
 ('leak', 4484),
 ('dielectric', 4020),
 ('test', 2929),
 ('pole', 2821),
 ('auger', 2765),
 ('winch', 2749),
 ('ucr', 2449),
 ('hydraulic', 2369),
 ('functions', 2166),
 ('rotation', 1819),
 ('controls', 1800),
 ('repairs', 1779),
 ('inop', 1564),
 ('stuck', 1356),
 ('check', 1338),
 ('digger', 1326),
 ('dot', 1297),
 ('throttle', 1289),
 ('perform', 1253),
 ('stage', 1238),
 ('hose', 1194),
 ('issues', 1186),
 ('air', 1141),
 ('broken', 1141),
 ('repair', 1113),
 ('outrigger', 1089),
 ('csn', 1081),
 ('hyd', 996),
 ('guide', 990),
 ('oil', 984),
 ('install', 968),
 ('control', 961),
 ('pto', 882),
 ('t', 873),
 ('derrick', 845),
 ('intermittent', 839),
 ('hop', 809),
 ('stow', 784),
 ('level', 780),
 ('function', 736),
 ('b', 714),
 ('cylinder', 700),
 ('won', 678),
 ('valve', 669),
 ('switch', 652),
 ('pin', 622),
 ('front', 615),
 ('remote', 607),
 ('issue', 600),
 ('needs', 594),
 ('gearbox', 585),
 ('tool', 5

In [159]:
verbs = dic_of_pos["VB"]
verb_freq = [(item, new_tokens_freq[item]) for item in verbs]
sorted_verb_freq = sorted(verb_freq, key = lambda x: x[1], reverse = True)
sorted_verb_freq[:100]

[('replace', 2466),
 ('be', 2270),
 ('slow', 935),
 ('have', 631),
 ('go', 594),
 ('come', 401),
 ('do', 307),
 ('make', 261),
 ('get', 241),
 ('lose', 195),
 ('remove', 194),
 ('run', 191),
 ('add', 162),
 ('leaking', 155),
 ('leave', 100),
 ('sling', 91),
 ('keep', 91),
 ('fell', 71),
 ('see', 56),
 ('swing', 54),
 ('take', 41),
 ('let', 32),
 ('cracked', 22),
 ('apply', 20),
 ('find', 17),
 ('losing', 15),
 ('multifunctioning', 14),
 ('energize', 14),
 ('popping', 12),
 ('making', 11),
 ('believe', 10),
 ('oring', 10),
 ('digging', 10),
 ('extended', 10),
 ('loosing', 10),
 ('follow', 10),
 ('lost', 9),
 ('approve', 9),
 ('damaged', 8),
 ('busted', 8),
 ('approved', 8),
 ('say', 8),
 ('appear', 7),
 ('replaced', 7),
 ('allow', 7),
 ('sticking', 7),
 ('grinding', 7),
 ('working', 7),
 ('attached', 6),
 ('rehose', 6),
 ('cavitating', 5),
 ('retractted', 5),
 ('needed', 5),
 ('feathering', 5),
 ('give', 5),
 ('enclose', 5),
 ('engaged', 4),
 ('lifting', 4),
 ('phased', 4),
 ('cutting',

In [96]:
adj = dic_of_pos["JJ"]
adj_freq = [(item, freq[item]) for item in adj ]
sorted_adj_freq= sorted(adj_freq, key = lambda x:x[1], reverse = True)
sorted_adj_freq[:10]

[('inoperable', 1294),
 ('annual', 1261),
 ('upper', 1233),
 ('lower', 631),
 ('new', 338),
 ('loose', 269),
 ('turntable', 264),
 ('high', 259),
 ('third', 253),
 ('bad', 242)]

In [97]:
adverbs = dic_of_pos["RB"]
adverb_freq = [(item, freq[item]) for item in adverbs]
sorted_adverb_freq = sorted(adverb_freq, key = lambda x: x[1], reverse = True)
sorted_adverb_freq[:10]

[('not', 3835),
 ('down', 2043),
 ('up', 1020),
 ('intermittently', 280),
 ('only', 244),
 ('back', 190),
 ('properly', 146),
 ('too', 124),
 ('very', 122),
 ('correctly', 117)]

In [205]:
def get_str(vec):
    if isinstance(vec, tuple):
        return " ".join(vec).strip()
    return vec.strip()

def feature_provider(parameter_vector):
    feature_pool=[sorted_noun_freq, sorted_verb_freq, sorted_adj_freq, sorted_adverb_freq, sorted_bigram_freq]
    features=[]
    for i in range(len(parameter_vector)):
        features+=[get_str(feature[0]) for feature in feature_pool[i][0:parameter_vector[i]]]
    return list(set(features))

features_used = feature_provider([1000,0,0,0,20])
np.savetxt("features2.csv", np.array(features_used), delimiter=",", fmt="%s")
features_used[0:5]

['rotary', 'won t', 'broken', 'boom functions', 'upper controls']

In [99]:
binary=[]
for index in range(0, len(clean)):
    binary += ([[int(feature in clean[index]) for feature in features_used[:-100]] + [int(feature in clean_grams[0]) for feature in features_used[-100:]]])
binary[0:2]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
