In [1]:
import re
from textblob import TextBlob
import numpy as np
from textblob import Word
from textblob.classifiers import NaiveBayesClassifier
import nltk
from nltk.probability import FreqDist
import random
import pandas as pd
from textblob import TextBlob

## Read File, Select Desired Column

In [2]:
file=pd.read_csv('Data.csv').loc[:, ["SR Summary"]]["SR Summary"];
file[1]

'inspection, dielectric. leak at digger(fitting in cab) ,remove extension shaft from winch,emg. stop not working at controls, boom cuts out(wait or play with outriggers works)'

## Clean Up Input

In [3]:
clean=[re.sub(r"  +"," ",re.sub(r"(?![\w\s]).", " ", line)).lower().strip() for line in file] #remove non-alphaneumeric
clean[1]

'inspection dielectric leak at digger fitting in cab remove extension shaft from winch emg stop not working at controls boom cuts out wait or play with outriggers works'

## Extract Tokens, Bigrams, Trigrams

In [4]:
appended_summary=" ".join([line for line in clean])
tokens = nltk.word_tokenize(appended_summary)
freq = nltk.FreqDist(tokens)

In [6]:
freq.most_common(10)

[('service', 158),
 ('request', 155),
 ('inspection', 73),
 ('pm', 50),
 ('dielectric', 35),
 ('annual', 29),
 ('and', 29),
 ('road', 27),
 ('boom', 24),
 ('leak', 22)]

In [7]:
bigrams = [bigram for line in clean for bigram in list(nltk.bigrams(line.split(" ")))]
bigram_freq = nltk.FreqDist(list(bigrams))
sorted_bigram_freq = bigram_freq.most_common()
sorted_bigram_freq[0:10]

[(('service', 'request'), 155),
 (('road', 'service'), 27),
 (('inspection', 'pm'), 27),
 (('and', 'dielectric'), 19),
 (('pm', 'and'), 17),
 (('annual', 'inspection'), 15),
 (('repairs', 'from'), 14),
 (('annual', 'pm'), 13),
 (('from', 'inspection'), 13),
 (('dielectric', 'test'), 12)]

In [8]:
trigrams = [trigram for line in clean for trigram in list(nltk.trigrams(line.split(" ")))]
trigram_freq = nltk.FreqDist(list(trigrams))
sorted_trigram_freq = trigram_freq.most_common()
sorted_trigram_freq[0:10]

[(('road', 'service', 'request'), 26),
 (('pm', 'and', 'dielectric'), 17),
 (('repairs', 'from', 'inspection'), 13),
 (('inspection', 'pm', 'and'), 12),
 (('annual', 'inspection', 'ndt'), 6),
 (('and', 'dielectric', 'test'), 6),
 (('inspection', 'pm', 'dielectric'), 6),
 (('receive', 'prep', 'unit'), 5),
 (('annual', 'pm', 'and'), 5),
 (('inspection', '6', 'month'), 5)]

## POS Tagging

In [9]:
wnl = nltk.WordNetLemmatizer()

def verb_checker(pair):
    if pair[1].startswith("V"):
        
        return Word(pair[0]).lemmatize("v")
    else:
        return pair[0]

In [10]:
tagged = nltk.pos_tag(tokens)
new_tokens = [verb_checker(item) for item in tagged] 
new_tokens[:5]

['complete', 'foot', 'pedal', 'wire', 'inspection']

In [11]:
new_tokens_tagged = nltk.pos_tag(new_tokens)

In [12]:
dic_of_pos = {"VB":[], "JJ":[], "RB":[], "NN":[]}
for token in set(new_tokens):
    pair = nltk.pos_tag([token])[0]
    if pair[1].startswith("VB"):
        dic_of_pos["VB"].append(pair[0])
    if pair[1].startswith("JJ"):
        dic_of_pos["JJ"].append(pair[0])
    if pair[1].startswith("NN"):
        dic_of_pos["NN"].append(pair[0])
    if pair[1].startswith("RB"):
        dic_of_pos["RB"].append(pair[0])

In [13]:
new_tokens_freq = nltk.FreqDist(new_tokens)
new_tokens_freq.most_common(10)

[('service', 158),
 ('request', 155),
 ('inspection', 73),
 ('pm', 50),
 ('dielectric', 35),
 ('leak', 30),
 ('annual', 29),
 ('and', 29),
 ('road', 27),
 ('boom', 24)]

## Most Frequent Nouns/Verbs/Adjectives/Adverbs

In [14]:
nouns = dic_of_pos["NN"]
noun_freq = [(item, freq[item]) for item in nouns]
sorted_noun_freq = sorted(noun_freq, key = lambda x: x[1], reverse = True)
sorted_noun_freq[:10]

[('service', 158),
 ('request', 155),
 ('inspection', 73),
 ('pm', 50),
 ('dielectric', 35),
 ('road', 27),
 ('boom', 24),
 ('leak', 22),
 ('repairs', 17),
 ('test', 16)]

In [15]:
verbs = dic_of_pos["VB"]
verb_freq = [(item, new_tokens_freq[item]) for item in verbs]
sorted_verb_freq = sorted(verb_freq, key = lambda x: x[1], reverse = True)
sorted_verb_freq[:10]

[('be', 17),
 ('replace', 11),
 ('see', 2),
 ('slow', 2),
 ('do', 2),
 ('cracked', 1),
 ('resealed', 1),
 ('come', 1),
 ('leaking', 1),
 ('remove', 1)]

In [16]:
adj = dic_of_pos["JJ"]
adj_freq = [(item, freq[item]) for item in adj ]
sorted_adj_freq= sorted(adj_freq, key = lambda x:x[1], reverse = True)
sorted_adj_freq[:10]

[('annual', 29),
 ('upper', 9),
 ('inoperable', 4),
 ('lower', 3),
 ('new', 2),
 ('complete', 2),
 ('loose', 2),
 ('visual', 2),
 ('severe', 1),
 ('own', 1)]

In [17]:
adverbs = dic_of_pos["RB"]
adverb_freq = [(item, freq[item]) for item in adverbs]
sorted_adverb_freq = sorted(adverb_freq, key = lambda x: x[1], reverse = True)
sorted_adverb_freq[:10]

[('not', 11),
 ('down', 4),
 ('up', 3),
 ('close', 1),
 ('apart', 1),
 ('belly', 1),
 ('intermittently', 1),
 ('there', 1),
 ('correctly', 1),
 ('back', 1)]

In [18]:
def feature_provider(parameter_vector):
    feature_pool=[sorted_noun_freq, sorted_verb_freq, sorted_adj_freq, sorted_adverb_freq, sorted_bigram_freq, sorted_trigram_freq]
    features=[]
    for i in range(len(parameter_vector)):
        features+=[feature[0] for feature in feature_pool[i][0:parameter_vector[i]]]
    return list(set(features))

features_used = feature_provider([5,5,5,5,5,5])
features_used[0:5]

[('annual', 'inspection', 'ndt'),
 'apart',
 ('road', 'service', 'request'),
 'inoperable',
 'lower']