In [None]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import date, timedelta, datetime
warnings.filterwarnings("ignore", category=DeprecationWarning)
nltk.download('stopwords')

%matplotlib qt
# %install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
# %load_ext autotime
nltk.download('stopwords')

## LOAD DATA

In [None]:
# Load the raw tweet export, drop the stray index column, order the rows
# chronologically, render the timestamp as 'd/mm/YYYY HH:MM' text and finally
# remove rows that are exact duplicates.
# NOTE(review): '%#d' is a platform-specific strftime flag (Windows style for an
# unpadded day) - confirm the output on other platforms.
all_tweets = (
    pd.read_csv('dataset/tweets_all_brisbane.csv')
    .drop(columns='Unnamed: 0')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date', ignore_index=True)
    .assign(date=lambda frame: frame['date'].dt.strftime('%#d/%m/%Y %H:%M'))
    .drop_duplicates(ignore_index=True)
)

In [None]:
# Preview the first five loaded tweets.
all_tweets.head()

## PROCESS DATA

In [None]:
# Work on a copy so the loaded frame stays untouched and this cell can be
# re-run after later cells have added columns to `combi`.
combi = all_tweets.copy()
combi.columns = ['date', 'tweet']

### REMOVING TWITTER MENTIONS (@user)

In [None]:
def remove_pattern(input_txt, pattern):
    """Return `input_txt` with every match of the regex `pattern` deleted.

    The pattern is applied directly. Re-using each matched string as a new
    regular expression would misbehave when a match contains regex
    metacharacters or is a prefix of a later match (for example "@ab" inside
    "@abc").

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to remove.

    Returns
    -------
    str
        The text with all matches removed; surrounding whitespace is kept.
    """
    return re.sub(pattern, '', input_txt)

# remove twitter handles (@user)
# The pattern is a raw string because "\w" is an invalid escape in a plain
# literal. Series.apply is used instead of np.vectorize so the cell also works
# on an empty frame.
combi['tidy_tweet'] = combi['tweet'].apply(remove_pattern, args=(r"@[\w]*",))

### REMOVING URLS

In [None]:
def remove_urls(df):
    """Strip URLs from the `tidy_tweet` column of `df` in place.

    Every run of non-whitespace characters starting with "http" is deleted.
    `regex=True` is passed explicitly: without it, pandas versions whose
    `str.replace` defaults to literal matching would leave the URLs untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'tidy_tweet'; it is modified in place.

    Returns
    -------
    None
    """
    df['tidy_tweet'] = df['tidy_tweet'].str.replace(r"http\S+", "", regex=True)

# Strip URLs from combi['tidy_tweet']; remove_urls modifies the frame in place.
remove_urls(combi)

### REMOVING PUNCTUATION, NUMBERS AND SPECIAL CHARACTERS

In [None]:
# Replace every character that is not an ASCII letter or '#' (digits,
# punctuation, emoji, ...) with a space. regex=True is explicit because
# pandas versions whose str.replace defaults to literal matching would
# otherwise search for the text "[^a-zA-Z#]" itself.
combi['tidy_tweet'] = combi['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

### REMOVING SHORT WORDS AND LOWERCASING

In [None]:
# Keep only the words longer than three characters, then fold the remaining
# text to lower case.
combi['tidy_tweet'] = (
    combi['tidy_tweet']
    .map(lambda tweet: ' '.join(token for token in tweet.split() if len(token) > 3))
    .str.lower()
)

### REMOVING STOPWORDS

In [None]:
# A set gives constant-time membership tests for the per-word lookup below.
# It is also passed to WordCloud further down, whose `stopwords` parameter is
# documented as a set of strings.
STOPWORDS = set(stopwords.words('english'))


def cleaning_stopwords(text):
    """Return `text` without the whitespace-separated words listed in STOPWORDS.

    `text` is converted with str() first, so non-string values are processed
    through their string form. The comparison is case-sensitive.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)


combi['tidy_tweet'] = combi['tidy_tweet'].apply(cleaning_stopwords)
combi['tidy_tweet'].head()

### TOKENIZATION

In [None]:
# Split every cleaned tweet on whitespace into a list of tokens.
tokenized_tweet = combi['tidy_tweet'].str.split()
tokenized_tweet.head()

### STEMMING

In [None]:
# Import only the name that is used; a star import would pull unrelated names
# into the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Replace every token with its Porter stem.
# Note: this overwrites tokenized_tweet, so re-running the cell stems the
# already-stemmed tokens again.
tokenized_tweet = tokenized_tweet.apply(lambda tokens: [stemmer.stem(token) for token in tokens])
tokenized_tweet.head()

In [None]:
# Join each token list back into one space-separated string. A new Series is
# built rather than overwriting tokenized_tweet element by element, so
# re-running this cell does not join the characters of already-joined strings.
combi['tidy_tweet'] = tokenized_tweet.apply(' '.join)
combi.columns = ['date', 'tweet', 'tidy_tweet']

# Substrings searched for in the stemmed tweets.
# - 'lockdown' was previously misspelled and could never match.
# - tidy_tweet holds Porter stems, so 'congest' is used: the stemmer reduces
#   'congestion' to that form.
# NOTE(review): words of three letters or fewer were dropped earlier, so 'hot'
# can only match inside longer words (e.g. 'photo') - confirm this is intended.
key_words = ['covid', 'lockdown', 'cold', 'rain', 'hot', 'congest', 'traffic', 'road']

In [None]:
# Collect the tweets whose cleaned text contains each keyword. The matches are
# gathered in a list and concatenated once, because DataFrame.append is not
# available in current pandas. As before, a tweet that matches several
# keywords appears once per matching keyword.
matches_per_keyword = [
    combi.loc[combi['tidy_tweet'].str.contains(key_word, regex=False), :]
    for key_word in key_words
]
new_df = pd.concat(matches_per_keyword) if matches_per_keyword else pd.DataFrame()

In [None]:
# Renumber the rows 0..n-1 and discard the index values inherited from combi.
new_df = new_df.reset_index(drop=True)

In [None]:
# Preview the first five matches with a wide `tweet` column. The Styler must be
# the cell's final expression to be rendered. set_properties treats every
# keyword argument as a CSS property, so only real CSS properties are passed.
new_df[['date', 'tweet']].head(5).style.set_properties(subset=['tweet'], **{'width': '1000px'})

In [None]:
# Temporarily raise the column width limit so long tweets are not truncated
# in the preview of the first five matches.
with pd.option_context('display.max_colwidth', 300):
    display(new_df[['date', 'tweet']].head(5))

In [None]:
# Concatenate all matching cleaned tweets into one space-separated string
# that feeds the word cloud.
text = " ".join(new_df['tidy_tweet'])

In [None]:
from wordcloud import WordCloud
from PIL import Image

# Read the mask image inside a context manager so the file handle is closed
# as soon as the pixel data has been copied into the array.
with Image.open('aus.png') as mask_image:
    mask = np.array(mask_image)

# NOTE(review): the WordCloud documentation states that width/height are
# ignored when a mask is supplied - confirm the output size is as intended.
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,  # fixed seed so the layout is reproducible
    background_color='white',
    colormap='tab10',
    collocations=False,
    stopwords=STOPWORDS,
    mask=mask,
).generate(text)

In [None]:
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Draw `wordcloud` as an image on a new 10x10 inch figure with the axes hidden."""
    canvas = plt.figure(figsize=(10, 10))
    axes = canvas.gca()
    # Render the cloud and suppress ticks, labels and the frame.
    axes.imshow(wordcloud)
    axes.axis("off")

In [None]:
# Render the word cloud built from the keyword-matching tweets.
plot_cloud(wordcloud)

## SENTIMENT CLASSIFICATION

### LOAD MODEL

In [None]:
from joblib import load
# Load the previously trained sentiment classifier from disk.
# NOTE(review): joblib files are pickle-based, so loading one can execute code
# stored in the file - only load model files from a trusted source.
model = load('sentiment-SVM-model.joblib')

In [None]:
# Show the hyper-parameters of the loaded model.
model.get_params()

### PREPARE DATA

In [None]:
# Classify every tweet: the feature source is the cleaned, stemmed text column.
# selected_df is another name for combi, not a copy.
selected_df = combi
X = selected_df['tidy_tweet']

In [None]:
# Turn the cleaned tweets into TF-IDF features over unigrams and bigrams,
# capped at 10000 features, and convert the sparse result to a dense array.
# NOTE(review): fit_transform learns a NEW vocabulary from these tweets. The
# features the loaded model was trained on are not visible in this notebook -
# confirm the columns produced here line up with them (normally the vectoriser
# fitted at training time is saved and reused here with transform()).
# Note: X is overwritten, so this cell cannot be re-run without first
# re-running the cell that selects X.
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
X = vectoriser.fit_transform(X)
X = X.toarray()

In [None]:
# Predict one sentiment label per tweet.
# (presumably 0 = negative, 1 = positive, as the counting step below compares against 0 and 1 - TODO confirm)
y_pred = model.predict(X)

In [None]:
# Pair each tweet timestamp with its predicted label. The predictions are
# attached by position (flattened to 1-D), so the result does not depend on
# combi's index labels matching 0..n-1.
result = pd.DataFrame({'date': combi['date'], 'sentiment': np.ravel(y_pred)})

In [None]:
# Persist the per-tweet predictions (date, sentiment) without the index column.
result.to_csv('dataset/tweets_all_brisbane_sentiment_done.csv', index=False)

## COUNTING POS/NEG TWEETS

### LOAD DATA

In [None]:
# Reload the per-tweet predictions written by the classification section above.
sentimented_tweets = pd.read_csv('dataset/tweets_all_brisbane_sentiment_done.csv')

### CREATE DATE RANGE

In [None]:
# Calendar days covered by the study: sdate inclusive, edate exclusive,
# rendered as 'd/mm/YYYY' text.
# NOTE(review): '%#d' is a platform-specific strftime flag (Windows style for an
# unpadded day) - confirm the output on other platforms.
sdate = pd.to_datetime('2021-02-21')
edate = pd.to_datetime('2021-06-19')
all_dates = pd.date_range(
    start=sdate,
    end=edate - timedelta(days=1),
    freq='D',
).strftime('%#d/%m/%Y')

In [None]:
def datetime_range(start, end, delta):
    """Yield start, start + delta, start + 2*delta, ... while the value is below `end`.

    `end` itself is never yielded. Nothing is yielded when `start` is not
    below `end`.
    """
    moment = start
    while True:
        if not moment < end:
            return
        yield moment
        moment += delta


# 'HH:MM' labels for the end of every 5-minute window of a day. The sequence is
# seeded at 23:59 of the previous day and that seed is dropped, leaving
# '00:04', '00:09', ..., '23:59'.
min_intervals = [
    moment.strftime('%H:%M')
    for moment in datetime_range(
        datetime(2021, 2, 20, 23, 59),
        datetime(2021, 2, 22),
        timedelta(minutes=5),
    )
][1:]

### COUNTING

In [None]:
# Count negative (sentiment == 0) and positive (sentiment == 1) tweets in every
# 5-minute window. A window is (previous boundary, boundary], and the first
# window opens at 20/02/2021 23:59.
TIMESTAMP_FORMAT = '%d/%m/%Y %H:%M'

# Parse all tweet timestamps once, instead of twice per window.
tweet_times = pd.to_datetime(sentimented_tweets['date'], format=TIMESTAMP_FORMAT)

# Rows are collected in a list and turned into a frame once at the end;
# growing a DataFrame with append is quadratic and unavailable in current pandas.
count_rows = []
window_start = pd.to_datetime('20/02/2021 23:59', format=TIMESTAMP_FORMAT)
# The loop variable is `day` so the `date` class imported from datetime is not shadowed.
for day in all_dates:
    # Same day pre-filter as before: the day text must occur in the timestamp text.
    on_day = sentimented_tweets['date'].str.contains(day, regex=False)
    day_times = tweet_times[on_day]
    day_sentiment = sentimented_tweets.loc[on_day, 'sentiment']
    for interval in min_intervals:
        label = f"{day} {interval}"
        window_end = pd.to_datetime(label, format=TIMESTAMP_FORMAT)
        in_window = (day_times > window_start) & (day_times <= window_end)
        window_sentiment = day_sentiment[in_window]
        neg = int((window_sentiment == 0).sum())
        pos = int((window_sentiment == 1).sum())
        count_rows.append((label, neg, pos))
        # The next window starts where this one ended.
        window_start = window_end

count_tweets_df = pd.DataFrame(count_rows, columns=['date', 'num_neg', 'num_pos'])

In [None]:
# Name the three columns, renumber the rows from zero and write the
# per-window counts to disk without the index column.
count_tweets_df.columns = ['date', 'num_neg', 'num_pos']
count_tweets_df = count_tweets_df.reset_index(drop=True)
count_tweets_df.to_csv('dataset/count_sentiment_tweets.csv', index=False)

## MERGE DATASET

In [None]:
tweets_data = pd.read_csv('dataset/count_sentiment_tweets.csv')
traffic_data = pd.read_csv('dataset/acceptable_traffic_data_processed.csv')

# Parse the timestamps BEFORE sorting so the rows are ordered chronologically;
# sorting the raw text is lexicographic and is only correct for some formats.
traffic_data['recorded'] = pd.to_datetime(traffic_data['recorded'])
traffic_data = traffic_data.sort_values(by='recorded', ignore_index=True)

# Render the timestamps in the same 'd/mm/YYYY HH:MM' text form used by the
# tweet-count table so the two can be matched on it.
# NOTE(review): '%#d' is a platform-specific strftime flag - confirm the output
# on other platforms.
traffic_data['recorded'] = traffic_data['recorded'].dt.strftime('%#d/%m/%Y %H:%M')
# Placeholder count columns, filled in by the merge step below.
traffic_data[['num_neg', 'num_pos']] = 0

In [None]:
# Preview the first five traffic records.
traffic_data.head(5)

In [None]:
# Keep only the tweet-count windows whose timestamp also occurs in the traffic
# data. A separate name is used for the traffic timestamps so the `all_dates`
# day list built for the counting step is not overwritten. The filtered frame
# is re-indexed by creating a new object rather than mutating a slice in place.
recorded_times = traffic_data['recorded'].to_numpy()
satisfied_df = tweets_data[tweets_data['date'].isin(recorded_times)].reset_index(drop=True)

In [None]:
# Preview the first five tweet-count windows that matched a traffic timestamp.
satisfied_df.head(5)

In [None]:
# Attach the tweet counts to each traffic record by TIMESTAMP. Assigning one
# frame to the other aligns on row position, which mislabels rows whenever the
# two tables do not have exactly one row per timestamp in the same order.
counts_by_time = satisfied_df.drop_duplicates(subset='date').set_index('date')
for column in ['num_neg', 'num_pos']:
    # Traffic timestamps with no matching count window get 0, the same
    # default the columns were initialised with.
    traffic_data[column] = (
        traffic_data['recorded'].map(counts_by_time[column]).fillna(0).astype(int)
    )