In [1]:
import os
import numpy as np
import pandas as pd
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn import preprocessing
import matplotlib.pyplot as plt 

# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download("wordnet")

In [2]:
df= pd.read_csv('combined_data_final.csv')

In [3]:
# Combine event narrative, injury list, and pertinent info of a case.
columns= ['pertinent_info','injury_list','event_narrative']
df[columns]= df[columns].fillna(value='')
df['text']= pd.DataFrame(df['event_narrative'] + '; ' + df['injury_list'] + '; ' + df['pertinent_info'])

In [4]:
df_words= df.loc[:,['case_name','state','text','amount']].copy()

In [5]:
df_words.to_csv('words.csv')

In [6]:
# Pick one state
state= 'NJ'
df_state= df_words[df_words['state']==state].copy()

In [7]:
# Min max scaling of amount
min_max_scaler= preprocessing.MinMaxScaler()
df_state['amount_scaled']= min_max_scaler.fit_transform(df_state[['amount']])

In [8]:
df_state

Unnamed: 0,case_name,state,text,amount,amount_scaled
8,"massey v. nj transit corporation, 2014 nj jury...",NJ,"the defendant bus driver, who was making a lef...",1250000,0.081933
9,"downey v. plummer, 2013 nj jury verdicts revie...",NJ,the plaintiff went to the emergency room with ...,1500000,0.099255
10,"smith v. brazel, 2013 nj jury verdicts review ...",NJ,the defendant's vehicle struck the plaintiff's...,2250000,0.151221
11,"brattan v. wilbert, 2014 nj jury verdicts revi...",NJ,the plaintiff was seated behind the handlebars...,745000,0.046943
12,"woods v. montgomery, 2012.nj jury verdicts rev...",NJ,the plaintiff alleged that the defendant negli...,300000,0.016109
28,"francisco v. wang, 2012 nj jury verdicts revie...",NJ,the plaintiff underwent an open oophorectomy a...,6300000,0.431838
29,james wilkerson v. campbell's auto express and...,NJ,the decedent was injured on the job while oper...,4500000,0.307119
30,"weiss v. swift, 2013 nj jury verdicts review l...",NJ,the plaintiff alleged that the defendant faile...,750000,0.047289
31,"robinson v. truck-tech, inc., et al., 2012 nj ...",NJ,the plaintiff alleged that the defendant truck...,2000000,0.133899
32,"garofano v. advanced imaging assoc., et al., 2...",NJ,the plaintiff had an ultrasound of the gall bl...,7000000,0.480340


In [9]:
our_stop_words= set(['plaintiff','plaintiffs','defendant','defendants','alleged','allegedly'])
stop_words= set(stopwords.words('english')).union(our_stop_words)
print(stop_words)

{'had', 'her', 'while', "you're", 'very', 've', 'it', 'from', 'those', 'shan', "weren't", 'did', 'won', 'll', 'few', 'mustn', 'not', 'weren', 'shouldn', 'that', "wouldn't", 'our', 'before', 'needn', 'to', 'wasn', 'ourselves', 'i', 'again', 'most', 'other', 'were', 'd', 're', "mustn't", 'under', 'aren', 'yourself', 'being', 'above', 'over', 'any', 'where', 'against', 'just', 'its', 'defendant', 'myself', "it's", 'mightn', 'yours', 'further', 'when', "won't", "haven't", 'their', 'do', 'a', 'down', "didn't", 'off', 'how', 'both', "couldn't", 'more', 't', "mightn't", 'we', 'hasn', "wasn't", 'now', 'only', 'through', 'in', 'am', 'and', 'hers', "that'll", 'he', 'you', 'alleged', 'isn', 'so', 'into', 'these', 'who', 'nor', 'whom', "needn't", 'such', 'ma', 'during', 'don', 'was', 'some', "don't", 'an', 'your', 'them', 'ain', "you'd", "shan't", 'doesn', "you'll", 'should', "isn't", 'his', 's', 'all', 'as', 'why', 'out', 'defendants', 'the', 'has', 'if', "you've", 'for', "hadn't", 'or', 'him', '

In [10]:
# Tokenize a text. Discard a token if it is shorter than len_threshold. 
def tokenize(text, len_threshold):
    wnl= WordNetLemmatizer()
    porter = PorterStemmer()
    
    # Remove punctuation and numbers.
    text= re.sub('[%s\d]' % string.punctuation, ' ', text)
    
    # Tokenize, lemmatize, and stem. 
    tokens= word_tokenize(text)
    tokens= [t for t in tokens if t not in stop_words and len(t) >= len_threshold]
    tokens= [wnl.lemmatize(t) for t in tokens]
#     tokens= [wnl.lemmatize(t,'v') for t in tokens_lemma]
    tokens = [porter.stem(t) for t in tokens]
    return tokens

In [11]:
df_idx= df_state.index
all_tokens= {}     # All tokenized texts. Key: index in dataframe, value: tokenized text. 
words_count= {}    # Count of words in all tokenized texts. Key: word, value: word count. 
len_threshold= 2   # Discard a token if it is shorter than len_threshold. 
for i in range(len(df_state)):
    idx= df_idx[i]
    text= df_state.loc[idx,'text']
    tokens= tokenize(text, len_threshold)
    all_tokens[idx]= tokens

    # Count words in all tokenized texts.
    for t in tokens:
        if t in words_count:
            words_count[t]+= 1
        else:
            words_count[t]= 1

# Vocabulary list
vocab= sorted(words_count.keys())
with open('vocab_' + state + '.txt', 'w') as f_out:
    for word in vocab:
        f_out.write(word + '\n')

In [12]:
print(all_tokens)
print(words_count)
print(vocab)

{8: ['bu', 'driver', 'make', 'left', 'turn', 'struck', 'crosswalk', 'sever', 'abdomin', 'wound', 'cervic', 'lumbar', 'herniat'], 9: ['went', 'emerg', 'room', 'abdomin', 'pain', 'neglig', 'fail', 'properli', 'diagnos', 'result', 'ruptur', 'appendix', 'ruptur', 'appendix', 'remov', 'ascend', 'colon', 'part', 'small', 'intestin', 'ileocec', 'valv', 'juri', 'award', 'parti', 'settl', 'amount', 'polici'], 10: ['vehicl', 'struck', 'vehicl', 'head', 'multipl', 'fractur', 'abdomin', 'scar', 'close', 'head', 'trauma', 'extens', 'intern', 'injuri', 'settl', 'primari', 'insur', 'excess', 'insur'], 11: ['seat', 'behind', 'handlebar', 'terrain', 'vehicl', 'atv', 'reach', 'began', 'oper', 'atv', 'atv', 'toppl', 'land', 'abdomen', 'bowel', 'perfor', 'abdomin', 'scar', 'made', 'incom', 'claim'], 12: ['neglig', 'lost', 'control', 'vehicl', 'cross', 'centerlin', 'caus', 'collid', 'head', 'vehicl', 'abdomin', 'scar', 'ankl', 'fractur', 'wrist', 'fractur', 'rib', 'coverag', 'underinsur', 'motorist', 'uim'

In [13]:
# Create dataframe for bag of words.
df_bow = pd.DataFrame(0, index=df_idx, columns=vocab)
for idx, tokens in all_tokens.items():
    for t in tokens:
        df_bow.loc[idx, t]+= 1

In [14]:
df_bow.to_csv('WIW_bow_' + state + '.csv')

In [15]:
df_bow

Unnamed: 0,abandon,abdomen,abdomin,abdomina,abl,ablat,abnorm,abras,abscess,abus,...,would,wound,wrist,written,wrong,year,yellow,young,youth,şeyer
8,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
28,0,0,4,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
29,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
31,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
df_amount_scaled= df_state[['amount_scaled']].copy()

In [21]:
df_amount_scaled

Unnamed: 0,amount_scaled
8,0.081933
9,0.099255
10,0.151221
11,0.046943
12,0.016109
28,0.431838
29,0.307119
30,0.047289
31,0.133899
32,0.480340


In [132]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [133]:
df_X= df_bow
df_y= df_amount_scaled.values.ravel()

In [134]:
train_size= round(len(df_X)*0.7)
df_X_train= df_X.loc[df_X.index[0:train_size],:].copy()
df_X_test= df_X.loc[df_X.index[train_size:],:].copy()
df_y_train= df_y[0:train_size]
df_y_test= df_y[train_size:]

In [135]:
linreg= LinearRegression()

In [136]:
print(len(df_X_train))
print(len(df_y_train))
print(len(df_X_test))
print(len(df_y_test))

108
108
46
46


In [137]:
linreg.fit(df_X_train, df_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [141]:
print('Training R^2: ' + str(linreg.score(df_X_train, df_y_train)))
print('Testing R^2: ' + str(linreg.score(df_X_test, df_y_test)))

Training R^2: 1.0
Testing R^2: 0.117852848028


In [142]:
coef= linreg.coef_
coef_sorted_idx= np.argsort(coef)
n= 10
top_idx= coef_sorted_idx[-n:]
bot_idx= coef_sorted_idx[:n]
top= [vocab[i] for i in top_idx]
top_coef= [coef[i] for i in top_idx]
bot= [vocab[i] for i in bot_idx]
bot_coef= [coef[i] for i in bot_idx]
print(top)
print(top_coef)
print(bot)
print(bot_coef)

['forc', 'produc', 'pitch', 'pal', 'alloy', 'cardiac', 'brain', 'trauma', 'chest', 'aluminum']
[0.038076753573083572, 0.038076753573083572, 0.038076753573083572, 0.038076753573083572, 0.038215843118550176, 0.040392832893452142, 0.044219311269457898, 0.047224348769895522, 0.056446664331585877, 0.076033455782026124]
['close', 'arm', 'screen', 'head', 'pulmonari', 'behind', 'tray', 'patron', 'tv', 'make']
[-0.021199966789365844, -0.020293005912936025, -0.016619387354815083, -0.016485337929003974, -0.015867524556077334, -0.01545765837791741, -0.015453524336025655, -0.015060383137018312, -0.013695008788912555, -0.013365711898809174]


In [None]:
# To be completed...

In [143]:
rfreg = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=0)

In [144]:
rfreg.fit(df_X, df_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

In [145]:
rfreg.oob_score_

0.20687978850169131

In [151]:
importance= rfreg.feature_importances_
sorted_idx= np.argsort(importance)
n= 20
top_idx= sorted_idx[-n:]
bot_idx= sorted_idx[:n]
bot_idx= [i for i in sorted_idx if importance[i] == 0.0]
top= [vocab[i] for i in top_idx]
top_imp= [importance[i] for i in top_idx]
bot= [vocab[i] for i in bot_idx]
bot_imp= [importance[i] for i in bot_idx]
print(top)
print(top_imp)
print(bot)
print(bot_imp)

['finger', 'quod', 'pal', 'report', 'produc', 'bat', 'aluminum', 'failur', 'claim', 'care', 'damag', 'alloy', 'easili', 'forc', 'wife', 'sever', 'game', 'profound', 'pitch', 'le']
[0.013122855161521974, 0.01364354816584392, 0.014027013202320075, 0.016403565264912399, 0.018095798180034459, 0.019562241468008456, 0.01959206035363199, 0.021300016772040554, 0.02130996603882164, 0.021830440497601234, 0.023324972197346406, 0.025806011458072292, 0.026212203890632786, 0.027559600647784389, 0.02816455522564338, 0.029419727651773375, 0.029875526197682439, 0.034821687309730176, 0.0370610424943792, 0.04200261346528001]
['abandon', 'period', 'conducta', 'benefit', 'plantar', 'plate', 'prednison', 'fecal', 'fascia', 'presenc', 'present', 'factor', 'procedur', 'ex', 'pubic', 'penetr', 'backup', 'chimney', 'rail', 'ray', 'recoveri', 'automobil', 'recuper', 'elev', 'releas', 'renov', 'requir', 'cage', 'atrial', 'revers', 'roof', 'pull', 'pe', 'par', 'biopsi', 'burst', 'intermitt', 'ischem', 'karyotyp', 

In [153]:
importance[vocab.index('die')]

0.00050750499112247997