In [3]:
import pandas as pd
import numpy as np
import io
import os
import re
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
path = '/content/drive/MyDrive/colab_data'
def de_emojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')
def tweet_proc(df, text_col='text'):
    df['orig_text'] = df[text_col]
    # Remove twitter handles
    df[text_col] = df[text_col].apply(lambda x:re.sub('@[^\s]+','',x))
    # Remove URLs
    df[text_col] = df[text_col].apply(lambda x:re.sub(r"http\S+", "", x))
    # Remove emojis
    df[text_col] = df[text_col].apply(de_emojify)
    # Remove hashtags
    df[text_col] = df[text_col].apply(lambda x:re.sub(r'\B#\S+','',x))
    return df[df[text_col]!='']


In [5]:
covid_tweet = pd.read_csv(os.path.join(path, "Covid-19 Twitter Dataset (Aug-Sep 2020).csv"))
covid_tweet = covid_tweet[covid_tweet.original_text.isnull()==False].drop_duplicates().reset_index(drop=True)
covid_tweet = tweet_proc(covid_tweet,'original_text')
covid_tweet['label'] = np.nan
covid_tweet = covid_tweet[covid_tweet.lang=='en']
covid_tweet = covid_tweet[['id', 'original_text', 'sentiment', 'label']].rename(columns={'original_text':'text'})
covid_tweet.head(3)

Unnamed: 0,id,text,sentiment,label
0,1.3e+18,RT 91-year-old Ex-Vice President Moody Awori Lands Inter County Covid-19 Committee Role,neu,
1,1.3e+18,"RT BREAKING: The Department of Health reports 4,339 more people caught COVID-19, pushing the national case count to 178,02",neu,
2,1.3e+18,RT Helps Out Fan Who Requested Him To Help Arrange A Bed For Her Positive Father (View Tweet)\n,pos,


In [6]:
basic_tweet = pd.read_csv(os.path.join(path, "sentiment140_twitter.csv"), names=['target', 'id', 'date', 'flag', 'user', 'text'], header=None,encoding = "ISO-8859-1")
basic_tweet = basic_tweet[basic_tweet.text.isnull()==False].drop_duplicates().reset_index(drop=True)
basic_tweet = tweet_proc(basic_tweet,'text')
basic_tweet['label'] = np.where(basic_tweet['target']==0, 0, 1)
basic_tweet['sentiment'] = np.where(basic_tweet['target']==0, 'neg', 'pos')
basic_tweet = basic_tweet[['id', 'text', 'sentiment', 'label']]
basic_tweet.head(3)

Unnamed: 0,id,text,sentiment,label
0,1467810369,"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",neg,0
1,1467810672,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,neg,0
2,1467810917,I dived many times for the ball. Managed to save 50% The rest go out of bounds,neg,0


In [7]:
df_lm = basic_tweet.append(covid_tweet)
df_clas = df_lm[['text', 'label']].dropna(subset=['label'])
print(len(df_lm), len(df_clas))
df_clas.head(3)

1842736 1600000


Unnamed: 0,text,label
0,"- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0.0
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0.0
2,I dived many times for the ball. Managed to save 50% The rest go out of bounds,0.0


# DL & Transfer Learning with fastai
some reference:

https://www.kaggle.com/maroberti/fastai-with-transformers-bert-roberta

https://www.kaggle.com/twhelan/covid-19-vaccine-sentiment-analysis-with-fastai

In [8]:
from fastai.text.all import *
# to solve version problem: !pip install fastcore==1.0.0

Here we told fastai that we are working with text data, which is contained in the text column of a pandas DataFrame called df_lm. We set is_lm=True since we want to train a language model, so fastai needs to label the input data for us. Finally, we told fastai to hold out a random 10% of our data for a validation set using valid_pct=0.1

In [None]:
dls_lm = TextDataLoaders.from_df(df_lm, text_col='text', is_lm=True, valid_pct=0.1)

