## Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


## Load in your data from kaggle.  
By working in a kaggle kernel, you can access the data directly from the competition, as well as make your submission without downloading your output file

In [2]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [3]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

## Data Pre-processing

import re
import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
#ps = PorterStemmer() 

# Create a function to clean the tweets
def cleanTxt(text):
    #text = re.sub('https?:\/\/\S+', '', text)                 # Removing hyperlink
    text = re.sub('@[A-Za-z0–9]+', '', text)                  #Removing @mentions
    text = re.sub('[^a-zA-Z]',' ', text)   
    text = re.sub('#', '', text)                              # Removing '#' hash tag
    text = re.sub('RT[\s]+', '', text)                        # Removing RT
    text = str(text).lower()                                  #convert all characters to lower case
    text = word_tokenize(text)                                # Tokenisation
    text = [item for item in text if item not in stop_words]  #Removing all stopwords
    text = [lemma.lemmatize(word=w,pos='v') for w in text]
    text = [i for i in text if len(i)>2]                      # removing words having length <=2 
    text = ' '.join(text)                                     #Converting list to string
    
    return text
train['CleanMessage']= train['message'].apply(cleanTxt)
train.head(10)

## Splitting out the X variable from the target

In [4]:
y = train['sentiment']
#X = train['CleanMessage']
X= train['message']

## Turning text into something your model can read

In [5]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [6]:
#X_train,X_test,y_train,y_test = train_test_split(X_vectorized,y,test_size=.30,shuffle=True, stratify=y, random_state=44)
from sklearn.model_selection import RepeatedKFold
kf = RepeatedKFold(n_splits=10, n_repeats=20, random_state=None) 

for train_index, test_index in kf.split(X_vectorized):
    #print("Train:", train_index, "Validation:",test_index)
    X_train, X_test = X_vectorized[train_index], X_vectorized[test_index] 
    y_train, y_test = y[train_index], y[test_index]
    

In [7]:
from imblearn.over_sampling import SMOTE
smote =SMOTE()
X_smote,y_smote =smote.fit_sample(X_train,y_train)

In [8]:
from collections import Counter
print ("Before smote",Counter(y_train))
print ("after smote",Counter(y_smote))

Before smote Counter({1: 7696, 2: 3275, 0: 2104, -1: 1163})
after smote Counter({1: 7696, 2: 7696, 0: 7696, -1: 7696})


## Training the model and evaluating using the validation set 

In [9]:
rfc = RandomForestClassifier(n_estimators=200, random_state=44)
rfc.fit(X_smote, y_smote)
rfc_pred = rfc.predict(X_test)

## Checking the performance of our model on the validation set

In [10]:
f1_score(y_test, rfc_pred,average="macro")

0.6245499365100469

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#print(confusion_matrix(y_val, rfc_pred))
print(classification_report(y_test, rfc_pred))
#print(accuracy_score(y_val, rfc_pred))

              precision    recall  f1-score   support

          -1       0.85      0.25      0.38       133
           0       0.67      0.47      0.55       249
           1       0.73      0.86      0.79       834
           2       0.75      0.79      0.77       365

    accuracy                           0.73      1581
   macro avg       0.75      0.59      0.62      1581
weighted avg       0.74      0.73      0.72      1581



## Getting our test set ready 

In [12]:
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [13]:
y_pred = rfc.predict(test_vect)

In [14]:
test['sentiment'] = y_pred
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


## Creating an output csv for submission

In [15]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)