In [51]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
pd.set_option('display.max_colwidth', None)

In [52]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


# First approach 

In [53]:
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

In [54]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [55]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [56]:
count_vectorizer.transform(test_df["text"]).todense().shape

(3263, 21637)

In [59]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [60]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.59453669, 0.56455572, 0.64082434])

In [29]:
clf.fit(train_vectors, train_df["target"])

RidgeClassifier()

In [30]:
sample_submission = pd.read_csv("sample_submission.csv")


In [31]:
sample_submission["target"] = clf.predict(test_vectors)

In [32]:
sample_submission["target"]

0       0
1       1
2       1
3       0
4       1
       ..
3258    1
3259    1
3260    1
3261    1
3262    0
Name: target, Length: 3263, dtype: int64

In [37]:
sample_submission.to_csv("sample_submission_1.csv",index=False)

In [38]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [39]:
test = pd.read_csv("sample_submission_1.csv")

# Second approach (using Spacy)

In [1]:
#We download the language models. In this case English and lg (big one)
! python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
     -------------------------------------- 400.7/400.7 MB 6.3 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')




In [2]:
import spacy
from spacy import displacy

In [3]:
# We load the model
nlp_en = spacy.load("en_core_web_lg") 

In [5]:
doc = nlp_en("Hello, How are you?, we are 6")

In [17]:
for token in doc:
    # Imprime en pantalla el texto y el part-of-speech tag predicho
    print(token.text, token.pos_)

Hello INTJ
, PUNCT
How SCONJ
are AUX
you PRON
? PUNCT
, PUNCT
we PRON
are AUX
6 NUM


In [22]:
train_df['ent']=train_df['text'].apply(lambda x: nlp_en(x).text)
train_df['label']=train_df['text'].apply(lambda x: nlp_en(x).label_)

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'label_'

In [23]:
train_df

Unnamed: 0,id,keyword,location,text,target,ent
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,"13,000 people receive #wildfires evacuation orders in California"
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.


In [28]:
pan_hash = nlp_en.vocab.strings["bread"]

nlp_en.vocab.strings[pan_hash]

'bread'

In [26]:
pan_hash

12180372428151321402

In [32]:
print("hash value:", doc.vocab.strings["coffee"])

hash value: 3197928453018144401


In [33]:
print(nlp_en.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
