In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

In [76]:
# Load the processed text dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data/processed_text.csv')

In [77]:
# Remove the leftover 'Unnamed: 0' column (presumably an index that was saved
# into the CSV). Reassigning instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError. `columns=` alone is sufficient,
# so the redundant axis=1 is dropped.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [78]:
# Lift the cap on displayed columns (None = show all of them) for every frame
# rendered from here on, then preview the first five rows.
pd.options.display.max_columns = None
df.head(n=5)

Unnamed: 0,YEAR,NEAREST_TOWN,NEAREST_ROAD,OBSERVED,ALSO_NOTICED,OTHER_WITNESSES,OTHER_STORIES,TIME_AND_CONDITIONS,ENVIRONMENT,REPORT_NUMBER,location_details,county,state,season,title,latitude,longitude,number,classification,geohash,temperature_high,temperature_mid,temperature_low,dew_point,humidity,cloud_cover,moon_phase,precip_intensity,precip_probability,precip_type,pressure,summary,uv_index,visibility,wind_bearing,wind_speed,preprocessed_text
0,2010,Double Springs,Highway 33,I was canoeing on the Sipsey river in Alabama....,3 hours before on river while I paused to fix ...,none,"No, but I've had a couple of weird things happ...","Dusk, was a clear day in about the high 80s te...",River running through a very remote forest are...,30680.0,,Winston County,Alabama,Summer,,,,30680.0,0,,,,,,,,,,,,,,,,,,"['canning', 'tipsy', 'river', 'alabama', 'dusk..."
1,1990,,,Ed L. was salmon fishing with a companion in P...,,On a commercial fishing boat at anchor at the ...,,"Early Fall, in the early 1990's.",,1261.0,East side of Prince William Sound,Valdez-Chitina-Whittier County,Alaska,Fall,,,,1261.0,1,,,,,,,,,,,,,,,,,,"['salmon', 'fishing', 'companion', 'prince', '..."
2,1974,Wakefield,Perry Ave.,"While attending U.R.I in the Fall of 1974,I wo...",none,none,"White gorilla was seen in a gravel pit, by a f...","10:00pm, very dark cool night.","Typical new England. Oak,Pine and Maple trees....",6496.0,"Great swamp area, Narragansett Indians",Washington County,Rhode Island,Fall,Report 6496: Bicycling student has night encou...,41.45,-71.5,6496.0,1,drm5ucxrc0,78.17,73.425,68.68,65.72,0.86,0.86,0.16,0.0,0.0,,1020.61,Foggy until afternoon.,4.0,2.75,198.0,6.92,"['attending', 'would', 'stay', 'girlfriend', '..."
3,1972,York; more specifically Manchester twp.,Raintree Road,"Hello, My name is Doug and though I am very re...",My friend noted that he had spent the night be...,3. Don't remember the name of the third. He wa...,Just noting that my friend spent the night bef...,Incident occured somewhere between the hours o...,,8000.0,I would rather not have exact location (listin...,York County,Pennsylvania,Summer,,,,8000.0,0,,,,,,,,,,,,,,,,,,"['hello', 'name', 'though', 'reluctant', 'post..."
4,1984,Yamhill,NW Fairdale,It was May 1984. Two friends and I were up in ...,Not sure. It took me a full 5 years to be able...,There were two other wittness' other than me. ...,,I know that it was May of 1984 because it was ...,Gravel logging road. Very dense fir forests. B...,703.0,"Logging roads north west of Yamhill, OR, about...",Yamhill County,Oregon,Spring,,,,703.0,0,,,,,,,,,,,,,,,,,,"['may', 'two', 'friend', 'hill', 'yamhill', 'l..."


In [79]:
#predictor = the preprocessed report text, target = the classification label
X, y = df['preprocessed_text'], df['classification']

In [80]:
# sanity check: number of rows in the predictor series
X.shape

(4952,)

In [81]:
# sanity check: the target must have the same number of rows as X
y.shape

(4952,)

In [82]:
#splitting the dataset: 75% train / 25% test, seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=88,
)

In [83]:
#creating a pipeline
#min_df / max_df are document-frequency cutoffs: a term is dropped when it
#appears in fewer than 15% of the documents or in more than 95% of them
prc_steps = [
    ('countvec', CountVectorizer(max_df=0.95, min_df=0.15)),
]
preprocess_pipeline = Pipeline(steps=prc_steps)

In [84]:
#vectorizing the text: the vocabulary is fitted on the training split only,
#and the training documents are transformed into term-count vectors
X_train_proc = preprocess_pipeline.fit_transform(X_train)

In [85]:
# (training documents, vocabulary terms kept by the min_df/max_df cutoffs)
X_train_proc.shape

(3714, 153)

In [86]:
#creating a bag of words: one row per training document, one column per
#vocabulary term, each cell holding that term's count in the document
feat_names = preprocess_pipeline.named_steps['countvec'].get_feature_names_out()

pd.DataFrame(data=X_train_proc.toarray(), columns=feat_names)

Unnamed: 0,across,almost,along,also,animal,another,anything,area,arm,around,away,back,bear,behind,believe,big,bigfoot,black,brown,came,camp,car,close,come,coming,could,covered,creature,dark,day,decided,deer,direction,dog,driving,even,ever,eye,face,fighting,first,foot,forest,found,friend,front,get,going,gone,good,got,ground,hair,happened,head,hear,heard,high,hill,home,hour,house,human,hunting,know,large,last,later,left,leg,light,like,little,long,look,looked,looking,loud,made,make,man,maybe,mile,minute,morning,moving,much,near,never,next,night,noise,nothing,noticed,old,one,outside,people,ran,really,right,road,run,running,said,saw,say,scared,second,see,seemed,seen,several,side,since,small,someone,something,sound,sounded,standing,started,still,stood,stopped,sure,take,tall,tell,thing,think,thought,three,time,told,took,top,towards,track,trail,tree,turned,two,walk,walked,walking,way,well,went,wood,would,yard,year
0,0,1,0,0,1,0,0,2,0,2,0,1,0,0,0,1,0,0,0,1,2,3,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,2,2,0,1,1,0,1,0,5,0,0,0,0,0,1,0,0,0,2,1,0,0,1,0,6,3,0,2,1,4,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,1,0,1,0,1,2,0,1,0,0,0,1,0,0,0,0,1,3,0,0,1,1,0,0,0,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,1,0,2,0,1,0,1
1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
2,0,0,0,2,0,2,1,1,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,1,1,1,1,1,0,0,0,0,1,0,0,2,0,1,0,2,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,3,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,3,0,1,0,5,0,0,0,5,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0,0,1,2,0,0,0,0,0,2,0,0,0,3,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,5,0,1,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,2,2,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,1,3,1
4,0,0,0,1,0,0,1,0,0,0,0,1,0,2,2,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,4,0,3,2,0,0,0,0,1,0,0,0,1,0,0,0,0,3,0,0,2,0,1,0,0,0,0,0,3,0,0,1,1,1,0,0,0,1,3,1,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,1,5,2,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,0,0,0,1,0,0,1,0,2,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3709,0,0,0,1,0,0,1,0,0,0,1,3,0,0,1,0,0,0,0,1,0,1,0,0,2,3,0,0,1,0,0,0,0,0,0,1,0,0,3,0,1,5,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,3,0,1,0,0,1,0,2,1,0,2,1,0,2,2,0,3,0,0,2,3,1,1,0,2,0,0,0,1,0,0,0,0,2,1,0,0,0,3,1,3,1,0,0,2,0,0,0,0,0,0,0,1,2,0,0,2,1,0,0,0,0,2,0,0,0,0,5,1,0,2,0,0,0,2,2,0,2,0,1,1,0,0,1,0,0,0,0
3710,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0
3711,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,0,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0
3712,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,1,0,0


In [87]:
#getting the probability of the classifications
#normalize=True returns each class's share of the training labels directly
#(instead of dividing raw counts by the row count by hand), so the priors
#are guaranteed to sum to 1
class_priors = y_train.value_counts(normalize=True)
class_priors

classification
0    0.50377
1    0.49623
Name: count, dtype: float64

### Using TF-IDF Vectorizer

In [88]:
#creating a pipeline using tfidf vectorizer
#min_df / max_df are document-frequency cutoffs: a term is dropped when it
#appears in fewer than 25% of the documents or in more than 95% of them
prc_steps_tfidf = [
    ('tfidfvec', TfidfVectorizer(max_df=0.95, min_df=0.25)),
]
preprocess_pipeline_tfidf = Pipeline(steps=prc_steps_tfidf)

In [89]:
# Fit the TF-IDF vocabulary/weights on the training split only and transform it.
X_train_proc_tfidf = preprocess_pipeline_tfidf.fit_transform(X_train)

In [90]:
# (training documents, vocabulary terms kept by the min_df/max_df cutoffs)
X_train_proc_tfidf.shape

(3714, 70)

In [91]:
#creating a dataframe of the tfidf features: one row per training document,
#one column per retained term, each cell holding that term's tf-idf weight
feat_names_tfidf = preprocess_pipeline_tfidf.named_steps['tfidfvec'].get_feature_names_out()

pd.DataFrame(data=X_train_proc_tfidf.toarray(), columns=feat_names_tfidf)

Unnamed: 0,across,animal,anything,area,around,away,back,bear,behind,bigfoot,came,could,creature,dark,day,first,foot,friend,get,going,got,hair,head,heard,know,large,left,like,long,look,looked,looking,made,mile,minute,never,next,night,one,ran,right,road,said,saw,second,see,seen,side,something,sound,started,still,stopped,tall,thing,thought,time,told,took,tree,turned,two,walked,walking,way,went,wood,would,yard,year
0,0.0,0.104901,0.000000,0.161962,0.154412,0.000000,0.068943,0.000000,0.00000,0.000000,0.097847,0.000000,0.000000,0.000000,0.000000,0.000000,0.074758,0.0,0.193978,0.222336,0.087677,0.103140,0.541992,0.000000,0.198287,0.092620,0.097174,0.197431,0.196146,0.102180,0.331738,0.000000,0.000000,0.101688,0.000000,0.000000,0.000000,0.093127,0.078282,0.000000,0.193609,0.000000,0.000000,0.000000,0.107789,0.000000,0.195018,0.091359,0.080408,0.000000,0.000000,0.107882,0.000000,0.093263,0.000000,0.00000,0.227264,0.000000,0.106867,0.000000,0.000000,0.000000,0.000000,0.000000,0.096002,0.169378,0.000000,0.086660,0.000000,0.093127
1,0.0,0.000000,0.000000,0.000000,0.000000,0.265030,0.205093,0.327906,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.222392,0.0,0.000000,0.000000,0.000000,0.306825,0.000000,0.228388,0.000000,0.000000,0.000000,0.195775,0.291751,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.271778,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.216973,0.000000,0.000000,0.000000,0.000000,0.239201,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.317913,0.000000,0.000000,0.000000,0.324021,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.277038
2,0.0,0.000000,0.120160,0.095692,0.091231,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.090691,0.123596,0.112931,0.110895,0.000000,0.176676,0.0,0.000000,0.000000,0.207209,0.121876,0.000000,0.000000,0.000000,0.000000,0.114827,0.233295,0.000000,0.120742,0.098000,0.000000,0.000000,0.000000,0.000000,0.107955,0.000000,0.000000,0.000000,0.000000,0.114390,0.304191,0.000000,0.430925,0.000000,0.462368,0.000000,0.000000,0.285043,0.000000,0.000000,0.000000,0.000000,0.220410,0.000000,0.10811,0.000000,0.122830,0.000000,0.107072,0.122426,0.108773,0.000000,0.000000,0.113442,0.000000,0.000000,0.000000,0.119250,0.000000
3,0.0,0.000000,0.130951,0.000000,0.099425,0.229458,0.000000,0.000000,0.00000,0.259817,0.000000,0.000000,0.134696,0.123074,0.120854,0.127179,0.096272,0.0,0.000000,0.000000,0.112909,0.132822,0.000000,0.494336,0.000000,0.000000,0.000000,0.169498,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.275951,0.117650,0.000000,0.359782,0.000000,0.000000,0.000000,0.110504,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.133862,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.218122,0.000000,0.111599,0.389880,0.119927
4,0.0,0.000000,0.121363,0.000000,0.000000,0.000000,0.082282,0.000000,0.26393,0.000000,0.116779,0.183198,0.000000,0.000000,0.000000,0.000000,0.089222,0.0,0.000000,0.000000,0.313925,0.000000,0.000000,0.000000,0.118327,0.110541,0.000000,0.235631,0.000000,0.000000,0.098981,0.121853,0.125670,0.000000,0.000000,0.000000,0.125775,0.555730,0.186858,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.349128,0.109036,0.191931,0.000000,0.000000,0.000000,0.132558,0.111309,0.228638,0.00000,0.090412,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.111146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3709,0.0,0.000000,0.105793,0.000000,0.000000,0.092687,0.215178,0.000000,0.00000,0.000000,0.101797,0.239544,0.000000,0.099429,0.000000,0.102746,0.388879,0.0,0.100905,0.000000,0.091217,0.000000,0.000000,0.000000,0.000000,0.289077,0.000000,0.000000,0.102032,0.000000,0.172566,0.106220,0.219095,0.000000,0.000000,0.000000,0.219278,0.000000,0.000000,0.225354,0.000000,0.000000,0.105034,0.227641,0.000000,0.162834,0.000000,0.000000,0.083654,0.200816,0.217908,0.112238,0.000000,0.194057,0.000000,0.00000,0.078813,0.000000,0.222363,0.188540,0.000000,0.191536,0.113318,0.105666,0.000000,0.088108,0.000000,0.000000,0.000000,0.000000
3710,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.346483,0.0,0.000000,0.257618,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.709798,0.000000,0.000000,0.000000,0.000000,0.244216,0.000000,0.000000,0.000000,0.000000,0.000000,0.233958,0.169020,0.000000,0.000000,0.000000,0.000000,0.186335,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.209982,0.000000,0.000000,0.000000,0.000000,0.222473,0.196256,0.000000,0.000000,0.000000,0.000000
3711,0.0,0.000000,0.000000,0.000000,0.153031,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.608499,0.207318,0.189430,0.000000,0.000000,0.000000,0.0,0.000000,0.220346,0.173786,0.204434,0.000000,0.000000,0.000000,0.000000,0.000000,0.130443,0.000000,0.000000,0.164385,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.170083,0.000000,0.144566,0.213649,0.000000,0.000000,0.000000,0.000000,0.191295,0.000000,0.213834,0.000000,0.184857,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.348413,0.171769,0.000000,0.000000
3712,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.151925,0.000000,0.24366,0.000000,0.000000,0.000000,0.230492,0.000000,0.000000,0.000000,0.329480,0.0,0.000000,0.000000,0.000000,0.227284,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.224988,0.232036,0.224084,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.213324,0.189094,0.000000,0.160725,0.000000,0.344905,0.000000,0.201323,0.000000,0.000000,0.000000,0.000000,0.000000,0.205520,0.000000,0.00000,0.000000,0.000000,0.000000,0.399354,0.000000,0.000000,0.000000,0.000000,0.211556,0.000000,0.000000,0.190968,0.000000,0.000000


## Naive Bayes Classifier

## Decision Tree Classifier

In [95]:
# Shallow decision tree: depth capped at 3, 10 candidate features considered
# per split; random_state fixes the feature sub-sampling so the fit is
# reproducible.
tree_clf = DecisionTreeClassifier(max_depth=3, max_features=10, random_state=88)
# fit() needs the training features and labels -- calling it with no arguments
# raises TypeError. NOTE(review): the TF-IDF matrix is assumed here; pass
# X_train_proc instead to train on the raw term counts.
tree_clf.fit(X_train_proc_tfidf, y_train)