In [2]:
# when installing the libraries after cloning the repository, use this command: pip install -r requirements.txt
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

import setuptools.dist

from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, LSTM
from keras.utils import pad_sequences

In [3]:
train_data = pd.read_csv("train.csv")

In [4]:
test_data = pd.read_csv("test.csv")

In [5]:
import re
import unidecode

def clean_text(text):
    if pd.isna(text):
        return ""
    # Odstránenie URL
    text = re.sub(r'http\S+', '', text)
    # Odstránenie hashtagov
    text = re.sub(r'#\w+', '', text)
    # Odstránenie viacnásobných otáznikov a výkričníkov
    text = re.sub(r'(\?{2,}|\!{2,})', '', text)
    # Odstránenie špeciálnych znakov (ponechanie len alfanumerických znakov a medzier)
    text = re.sub(r'[^\w\s]', '', text)
    # Normalizácia textu - odstránenie diakritiky
    text = unidecode.unidecode(text)
    # Previesť text na malé písmená
    text = text.lower()
    return text

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def clean_data(df):
  
    clean_df = df.copy()
    clean_df.drop(columns=['location'], inplace=True)
    clean_df['keyword'].fillna('unknown', inplace=True)
    clean_df['keyword'] = clean_df['keyword'].apply(clean_text)
    clean_df['keyword'] = clean_df['keyword'].apply(remove_numbers)
    clean_df['text'] = clean_df['text'].apply(clean_text)

    return clean_df


In [6]:
new_train_data = clean_data(train_data)

new_train_data.head(20)

Unnamed: 0,id,keyword,text,target
0,1,unknown,our deeds are the reason of this may allah fo...,1
1,4,unknown,forest fire near la ronge sask canada,1
2,5,unknown,all residents asked to shelter in place are be...,1
3,6,unknown,13000 people receive evacuation orders in cal...,1
4,7,unknown,just got sent this photo from ruby as smoke f...,1
5,8,unknown,update california hwy 20 closed in both dire...,1
6,10,unknown,heavy rain causes flash flooding of streets ...,1
7,13,unknown,im on top of the hill and i can see a fire in ...,1
8,14,unknown,theres an emergency evacuation happening now i...,1
9,15,unknown,im afraid that the tornado is coming to our area,1


In [7]:
import re
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
def add_hashtags_column(df):
    new_df = df.copy()
    # Extract words that start with '#' and remove the '#' symbol and remove also special characters and numbers
    new_df['hashtags'] = new_df['text'].apply(
        lambda x: [re.sub(r'[^a-zA-Z]', '', re.sub(r'^#+', '', word)) for word in x.split() if word.startswith('#')]
    )
    return new_df
def get_sentiment(text):
    # Initialize VADER sentiment intensity analyzer
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(text)
    if sentiment_score['compound'] >= 0.05:
        return 'Positive'
    elif sentiment_score['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lenocka/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
new_clean_data = add_hashtags_column(train_data)
new_clean_data = clean_data(new_clean_data)
new_clean_data['sentiment'] = new_clean_data['text'].apply(get_sentiment)

In [9]:
new_clean_data.head()

Unnamed: 0,id,keyword,text,target,hashtags,sentiment
0,1,unknown,our deeds are the reason of this may allah fo...,1,[earthquake],Positive
1,4,unknown,forest fire near la ronge sask canada,1,[],Negative
2,5,unknown,all residents asked to shelter in place are be...,1,[],Negative
3,6,unknown,13000 people receive evacuation orders in cal...,1,[wildfires],Neutral
4,7,unknown,just got sent this photo from ruby as smoke f...,1,"[Alaska, wildfires]",Neutral


In [10]:
import numpy as np
import spacy

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')  # Or 'en_core_web_lg' for larger vectors

def preprocess_and_vec(text):
    # Process the text with SpaCy
    doc = nlp(text)
    
    # Filter out stop words and punctuation, and collect lemmatized tokens
    filtered_text = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    # Check if there are no valid tokens
    if not filtered_text:
        print('empty', doc)
        return None
    
    # Retrieve vectors for each token in filtered_text
    vectors = [token.vector for token in doc if token.lemma_ in filtered_text]
    
    if vectors:
        # Calculate the mean vector of the words
        mean_vector = np.mean(vectors, axis=0)
        return mean_vector
    else:
        return None

In [11]:
new_clean_data['vector'] = new_clean_data['text'].apply(preprocess_and_vec)

In [12]:
from sklearn.preprocessing import LabelEncoder

def label_encoding(df):
    # Initialize the LabelEncoder
    label_encoder = LabelEncoder()
    new_df = df.copy()

    # Fit and transform the 'sentiment' column
    new_df['sentiment'] = label_encoder.fit_transform(new_df['sentiment'])
    new_df['keyword'] = label_encoder.fit_transform(new_df['keyword'])

    return new_df

In [13]:
# new_clean_data = label_encoding(new_clean_data)

In [23]:
#X = new_clean_data[['vector', 'keyword', 'sentiment']]
X = new_clean_data['vector']
y = new_clean_data['target']

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=y
)

In [16]:
len(X_train)

6090

In [17]:
len(y_train)

6090

In [26]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

clf = RandomForestClassifier()

# Proceed with model fitting

clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.81      0.75       869
           1       0.67      0.51      0.58       654

    accuracy                           0.68      1523
   macro avg       0.68      0.66      0.66      1523
weighted avg       0.68      0.68      0.68      1523

