In [22]:
#import dependencies
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Read the training and test CSV files into DataFrames.
train_df = pd.read_csv(filepath_or_buffer="data/train.csv")
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")

In [4]:
# Text of the first training row whose target is 0 (not an emergency).
# A single .loc lookup selects rows and column together instead of chaining [].
train_df.loc[train_df["target"] == 0, "text"].iloc[0]

"What's up man?"

In [6]:
# Text of the first training row whose target is 1 (an emergency).
# A single .loc lookup selects rows and column together instead of chaining [].
train_df.loc[train_df["target"] == 1, "text"].iloc[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [7]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [13]:
# (rows, columns) of the training frame.
train_df.shape

(7613, 5)

In [11]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [12]:
# (rows, columns) of the test frame.
test_df.shape

(3263, 4)

In [59]:
# Token counter that strips accents and drops the built-in English stop words.
vectorizer = CountVectorizer(
    stop_words='english',
    strip_accents='unicode',
)

In [60]:
# Display the vectorizer's repr to confirm how it is configured.
vectorizer

CountVectorizer(stop_words='english', strip_accents='unicode')

In [61]:
# Small sample for experimenting: the text of the first six training rows.
six_line_test = train_df["text"].values[:6]

In [62]:
# Learn the vocabulary from the six sample tweets and build their count matrix.
x = vectorizer.fit_transform(raw_documents=six_line_test)

In [63]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
x

<6x44 sparse matrix of type '<class 'numpy.int64'>'
	with 49 stored elements in Compressed Sparse Row format>

In [64]:
# Get the vectorizer's analyzer, the callable it uses to turn raw text into tokens.
analyze = vectorizer.build_analyzer()

In [65]:
# Display the analyzer callable itself. Its repr embeds the full stop-word set,
# so the output is long; calling analyze(<some text>) would show the tokens instead.
analyze

functools.partial(<function _analyze at 0x7fa761841dc0>, ngrams=<bound method _VectorizerMixin._word_ngrams of CountVectorizer(stop_words='english', strip_accents='unicode')>, tokenizer=<built-in method findall of re.Pattern object at 0x7fa75045b370>, preprocessor=functools.partial(<function _preprocess at 0x7fa761841ca0>, accent_function=<function strip_accents_unicode at 0x7fa761841e50>, lower=True), decoder=<bound method _VectorizerMixin.decode of CountVectorizer(stop_words='english', strip_accents='unicode')>, stop_words=frozenset({'nor', 'will', 'they', 'detail', 'around', 'get', 'through', 'ours', 'indeed', 'were', 'across', 'than', 'at', 'up', 'himself', 'into', 'with', 'must', 'ten', 'almost', 'themselves', 'least', 'this', 'a', 'nobody', 'your', 'each', 'made', 'well', 'same', 'by', 'couldnt', 'eg', 'formerly', 'even', 'thin', 'ever', 'seem', 'anyone', 'bill', 'system', 'who', 'amount', 'became', 'however', 'out', 'namely', 'move', 'whatever', 'about', 'never', 'whence', 'fift

In [66]:
# Vocabulary learned from the six sample tweets.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps the displayed result a plain list of str, as before.
(
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

['000',
 '13',
 '20',
 'alaska',
 'allah',
 'asked',
 'cafire',
 'california',
 'canada',
 'closed',
 'county',
 'deeds',
 'directions',
 'earthquake',
 'evacuation',
 'expected',
 'forest',
 'forgive',
 'got',
 'hwy',
 'just',
 'la',
 'lake',
 'near',
 'notified',
 'officers',
 'orders',
 'people',
 'photo',
 'place',
 'pours',
 'reason',
 'receive',
 'residents',
 'rockyfire',
 'ronge',
 'ruby',
 'sask',
 'school',
 'sent',
 'shelter',
 'smoke',
 'update',
 'wildfires']

In [67]:
# Dense ndarray version of the sparse count matrix `x`.
# NOTE(review): `array` is not referenced again in the visible cells.
array = x.toarray()

In [68]:
# First six rows of the training frame, selected by position.
six_line_test_df = train_df.iloc[:6]

In [69]:
# Preview the sample frame (head shows five of its six rows).
six_line_test_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [74]:
# Unigram + bigram counter. The custom token pattern also keeps
# one-character tokens, since it matches any run of one or more word characters.
bigram_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'\b\w+\b',
    ngram_range=(1, 2),
    min_df=1,
)
# Rebinds `analyze` to the bigram vectorizer's analyzer callable.
analyze = bigram_vectorizer.build_analyzer()

In [75]:
# Display the bigram analyzer callable. Its repr embeds the full stop-word set,
# so the output is long.
analyze

functools.partial(<function _analyze at 0x7fa761841dc0>, ngrams=<bound method _VectorizerMixin._word_ngrams of CountVectorizer(ngram_range=(1, 2), stop_words='english',
                strip_accents='unicode', token_pattern='\\b\\w+\\b')>, tokenizer=<built-in method findall of re.Pattern object at 0x7fa7617fc2f0>, preprocessor=functools.partial(<function _preprocess at 0x7fa761841ca0>, accent_function=<function strip_accents_unicode at 0x7fa761841e50>, lower=True), decoder=<bound method _VectorizerMixin.decode of CountVectorizer(ngram_range=(1, 2), stop_words='english',
                strip_accents='unicode', token_pattern='\\b\\w+\\b')>, stop_words=frozenset({'nor', 'will', 'they', 'detail', 'around', 'get', 'through', 'ours', 'indeed', 'were', 'across', 'than', 'at', 'up', 'himself', 'into', 'with', 'must', 'ten', 'almost', 'themselves', 'least', 'this', 'a', 'nobody', 'your', 'each', 'made', 'well', 'same', 'by', 'couldnt', 'eg', 'formerly', 'even', 'thin', 'ever', 'seem', 'anyone'

In [76]:
# Fit the bigram vectorizer on the six sample tweets and convert the counts
# to a dense array, then display it.
X_2 = bigram_vectorizer.fit_transform(raw_documents=six_line_test).toarray()
X_2

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 

In [114]:
# Fit a TF-IDF vectorizer on the full training text and build the training
# feature matrix.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer that was fitted
# on the six-tweet sample in the cells above.
vectorizer = TfidfVectorizer(encoding='utf-8', decode_error='strict', strip_accents='unicode')
train_vector = vectorizer.fit_transform(train_df["text"])

In [115]:
# Encode the test tweets using the already-fitted vectorizer (transform only, no refit).
test_vector = vectorizer.transform(raw_documents=test_df["text"])

In [116]:
# Ridge classifier with default hyperparameters.
clf = linear_model.RidgeClassifier()

In [118]:
# 5-fold cross-validated F1 score of the classifier on the training features.
# NOTE(review): train_vector comes from a vectorizer fitted on all training
# text, so each validation fold's text influenced the features; wrap the
# vectorizer and classifier in a Pipeline if strict fold isolation is needed.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
scores

array([0.63148148, 0.55459272, 0.64457332, 0.59444444, 0.72235481])

In [89]:
# Train the classifier on the complete training set.
clf.fit(X=train_vector, y=train_df["target"])

RidgeClassifier()

In [90]:
# Load the submission template.
sample_submission = pd.read_csv(filepath_or_buffer="data/sample_submission.csv")

In [92]:
# Store the classifier's predictions for the test tweets in the `target` column.
# assumes sample_submission rows are in the same order as test_df — TODO confirm
sample_submission["target"] = clf.predict(test_vector)

In [93]:
# Preview the first five rows of the filled-in submission.
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
