## Import the necessary libraries

In [18]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.svm import SVC

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



## Load in your data from kaggle.  
By working in a kaggle kernel, you can access the data directly from the competition, as well as make your submission without downloading your output file

In [19]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [20]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [None]:
train.head()

In [21]:
blanks = []  

for i,lb,rv, usr in train.itertuples():  
    if type(rv)==str:            
        if rv.isspace():         
            blanks.append(i)     
        
print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [22]:
test.isnull().sum()

message    0
tweetid    0
dtype: int64

In [23]:
## Remove urls

pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
train['message'] = train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [24]:
## Remove puncuations
def remove_punctuation_numbers(message):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in message if l not in punc_numbers])
train['message'] = train['message'].apply(remove_punctuation_numbers)

Resampling

In [25]:
from sklearn.utils import resample
believe = train[train['sentiment'] == 1]
anti_belief = train[train['sentiment'] == -1]
neutral = train[train['sentiment'] == 0]
followers = train[train['sentiment'] == 2]


In [26]:
anti_belief_upsampled = resample(anti_belief, replace = True, n_samples = len(followers), random_state = 42)
believe_downsampled = resample(believe, replace = True, n_samples = len(followers), random_state = 42)
neutral_upsampled = resample(neutral, replace = True, n_samples = len(followers), random_state = 42)

In [27]:
resampled = pd.concat([followers, anti_belief_upsampled, believe_downsampled, neutral_upsampled])

In [28]:
resampled.sentiment.value_counts()

-1    3640
 2    3640
 1    3640
 0    3640
Name: sentiment, dtype: int64

## Splitting out the X variable from the target

In [29]:
y = resampled['sentiment']
X = resampled['message']

## Turning text into something your model can read

In [30]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [31]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.10,shuffle=True, stratify=y, random_state=42)

## Training the model and evaluating using the validation set 

In [40]:
clf = SVC( kernel ='rbf', C= 5)
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_val)

## Checking the performance of our model on the validation set

In [41]:
f1_score(y_val, clf_pred, average="macro")

0.8797794377606026

In [None]:
print(accuracy_score(y_val,clf_pred))

## Getting our test set ready 

In [None]:
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [None]:
y_pred = clf.predict(test_vect)

In [None]:
test['sentiment'] = y_pred

In [None]:
test.head()

## Creating an output csv for submission

In [None]:
test[['tweetid','sentiment']].to_csv('finalsubmission.csv', index=False)