# Fast Text

In [94]:
# read in kaggle's training data
import pandas as pd
df = pd.read_csv('Data/train.csv')

In [95]:
# check shape and head
print(df.shape)
df.head()

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [96]:
# format target labels for fasttext:
# must have "__label__" before each target lable
df['target'] = "__label__" + df['target'].astype(str)

# check
df.target.unique()

array(['__label__1', '__label__0'], dtype=object)

In [97]:
# merge target and text columns so the text is on the same line as the label
# with the label first (separated by a space); this is required for 
# fasttext formatting
df['target_text'] = df['target'] + " " + df['text']

# check
df.target_text.head()

0    __label__1 Our Deeds are the Reason of this #e...
1    __label__1 Forest fire near La Ronge Sask. Canada
2    __label__1 All residents asked to 'shelter in ...
3    __label__1 13,000 people receive #wildfires ev...
4    __label__1 Just got sent this photo from Ruby ...
Name: target_text, dtype: object

In [98]:
# define preprocess function

import re

# preprocess function subs everything that's not a word character or 
# is a space character or is an apostrophe with a space, then subs all
# multiple spaces with just one space, then strips leading and lagging spaces
# and converts all letters to lowercase, then subs newline characters with a
# single space and subs all multiple spaces with just one space again
def preprocess(text):
    text = re.sub(r"[^\w\s\']", " ", text)
    text = re.sub(r" +", " ", text)
    text = text.strip().lower()
    text = re.sub(r"\\n+", " ", text)
    text = re.sub(r" +", " ", text)
    return text.strip()

# apply preprocess function to target_text column
df['target_text'] = df['target_text'].map(preprocess)

# check
df.target_text.head()

0    __label__1 our deeds are the reason of this ea...
1     __label__1 forest fire near la ronge sask canada
2    __label__1 all residents asked to 'shelter in ...
3    __label__1 13 000 people receive wildfires eva...
4    __label__1 just got sent this photo from ruby ...
Name: target_text, dtype: object

In [99]:
df.target_text[1]
# why does it look like there's a space in front of this tweet in the
# output above?

'__label__1 forest fire near la ronge sask canada'

In [100]:
for num in range(0, 50):
    print(df.target_text[num])

__label__1 our deeds are the reason of this earthquake may allah forgive us all
__label__1 forest fire near la ronge sask canada
__label__1 all residents asked to 'shelter in place' are being notified by officers no other evacuation or shelter in place orders are expected
__label__1 13 000 people receive wildfires evacuation orders in california
__label__1 just got sent this photo from ruby alaska as smoke from wildfires pours into a school
__label__1 rockyfire update california hwy 20 closed in both directions due to lake county fire cafire wildfires
__label__1 flood disaster heavy rain causes flash flooding of streets in manitou colorado springs areas
__label__1 i'm on top of the hill and i can see a fire in the woods
__label__1 there's an emergency evacuation happening now in the building across the street
__label__1 i'm afraid that the tornado is coming to our area
__label__1 three people died from the heat wave so far
__label__1 haha south tampa is getting flooded hah wait a secon

In [101]:
# make preprocessed text column (without label) to test model on
df['processed_text'] = df['text'].map(preprocess)

# check
df.processed_text.head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to 'shelter in place' are ...
3    13 000 people receive wildfires evacuation ord...
4    just got sent this photo from ruby alaska as s...
Name: processed_text, dtype: object

In [102]:
# tts

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, 
                               test_size = 0.2, 
                               random_state = 2022,
                               stratify = df.target)

# check
print(train.shape)
print(test.shape)

(6090, 7)
(1523, 7)


In [103]:
# save formatted and labeled column in file for fasttext to train on

import csv

train.to_csv("disaster_train_fasttext.csv", 
             columns = ["target_text"], 
             index = False, 
             header = False,
             quoting = csv.QUOTE_NONNUMERIC)

In [104]:
!pip install fasttext



In [105]:
import fasttext

In [106]:
# train model with labels
model = fasttext.train_supervised(input = 'disaster_train_fasttext.csv',
                                 epoch = 10,
                                 ws = 5,
                                 dim = 50)

In [107]:
df = pd.read_csv('disaster_train_fasttext.csv')

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6089 entries, 0 to 6088
Data columns (total 1 columns):
 #   Column                                                                                                                                         Non-Null Count  Dtype 
---  ------                                                                                                                                         --------------  ----- 
 0   __label__0 reddit will now quarantine offensive content reddit co founder and ceo steve huffman has unveiled more specif http t co ljmgdpdlvs  6089 non-null   object
dtypes: object(1)
memory usage: 47.7+ KB


In [109]:
?model

In [None]:
# check model
model.predict("hello how are you")

# THIS KILLS MY KERNEL EVERY TIME

In [None]:
# read in training data
import pandas as pd
df = pd.read_csv('Data/train.csv')

# format target labels for fasttext:
# must have "__label__" before each target lable
df['target'] = "__label__" + df['target'].astype(str)

# merge target and text columns so the text is on the same line as the label
# with the label first (separated by a space); this is required for 
# fasttext formatting
df['target_text'] = df['target'] + " " + df['text']

# preprocess function subs everything that's not a word character or 
# is a space character or is an apostrophe with a space, then subs all
# multiple spaces with just one space, then strips leading and lagging spaces
# and converts all letters to lowercase, then subs newline characters with a
# single space and subs all multiple spaces with just one space again
import re
def preprocess(text):
    text = re.sub(r"[^\w\s\']", " ", text)
    text = re.sub(r" +", " ", text)
    text = text.strip().lower()
    text = re.sub(r"\\n+", " ", text)
    text = re.sub(r" +", " ", text)
    return text.strip()

# apply preprocess function to target_text column
df['target_text'] = df['target_text'].map(preprocess)

# make preprocessed text column (without label) to test model on
df['processed_text'] = df['text'].map(preprocess)

# tts
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, 
                               test_size = 0.2, 
                               random_state = 2022,
                               stratify = df.target)

# save formatted and labeled column in file for fasttext to train on
import csv
train.to_csv("disaster_train_fasttext.csv", 
             columns = ["target_text"], 
             index = False, 
             header = False,
             quoting = csv.QUOTE_NONNUMERIC)

# install and import fasttext
!pip install fasttext
import fasttext

# train model with labels
model = fasttext.train_supervised(input = 'disaster_train_fasttext.csv')

# check model
model.predict("hello how are you")

In [None]:
print(len(test.processed_text))

In [None]:
# make file with processed text (without labels) in test set
# to test model on

test.to_csv("disaster_test_fasttext.csv", 
             columns = ["processed_text"], 
             index = False, 
             header = False,
             quoting = csv.QUOTE_NONNUMERIC)

In [None]:
print(len(test.target))

In [None]:
# make file with true labels for test set to compare model predictions to

test.to_csv("disaster_test_fasttext_true_labels.csv",
           columns = ['target'],
           index = False,
           header = False,
           quoting = csv.QUOTE_NONNUMERIC)

In [None]:
# load test data
# with open('disaster_test_fasttext.csv', 'r', encoding = 'utf-8') as f:
#     test_data = f.readlines()

test_data = pd.read_csv("disaster_test_fasttext.csv", 
                        header = None,
                        quoting = csv.QUOTE_NONNUMERIC)

print(len(test_data))
    
# check
print(test_data[:10])

In [None]:
# check
test_data[0][0]

In [None]:
# remove newline characters
# test_data = [line.strip() for line in test_data]

# check
# print(test_data[:10])

In [None]:
# check length of test data
print(len(test_data))

In [None]:
# convert test data to list
test_data_list = test_data[0].tolist()

# check
test_data_list[:10]

In [None]:
true_labels = pd.read_csv("disaster_test_fasttext_true_labels.csv", 
                        header = None,
                        quoting = csv.QUOTE_NONNUMERIC)

print(len(true_labels))
    
# check
print(true_labels[:10])

In [None]:
# check length of true_labels
print(len(true_labels))

In [None]:
# convert true labels to list
true_labels_list = true_labels[0].tolist()

# check
true_labels_list[:10]

In [None]:
# this line of code kills the kernel

# predict labels for test data
# model.predict(test_data[0][0])
# predictions = [model.predict(line)[0] for line in test_data.iterrows()]

# check
# print(predictions[:10])

In [None]:
# check length of predictions
# print(len(predictions))

In [None]:
test_data_list[0]

In [None]:
# this is causing my kernel to crash
# https://fasttext.cc/docs/en/supervised-tutorial.html

model.predict("which baking dish is best to bake a banana bread")

In [None]:
# test_prediction = model.predict(test_data_list[0])
# test_prediction

In [None]:
# # get the f1 score

# from sklearn.metrics import f1_score

# f1 = f1_score(true_labels, predictions, average = 'macro')

In [None]:


# model.test("disaster_test_fasttext")

# # three output numbers: size of test samples, precision, recall

In [None]:
# how to get f1?
# model.test_label("disaster_test_fasttext")

In [None]:
# y_pred = model.predict("disaster_test_fasttext")

# y_pred

In [None]:
# from sklearn.metrics import classification_report

# classification_report()