## Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load your data from Kaggle
By working in a Kaggle kernel, you can read the data directly from the competition and make your submission without downloading your output file.

In [None]:
# Competition CSVs as exposed inside the Kaggle kernel.
train_path = '../input/climate-change-edsa2020-21/train.csv'
test_path = '../input/climate-change-edsa2020-21/test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
# (rows, columns) of the training frame
print(train.shape)

In [None]:
# Peek at the last few training rows
train.tail()

In [None]:
# Peek at the first few test rows
test.head()

In [None]:
# Column dtypes and non-null counts for the training frame
train.info()


In [None]:
# Column dtypes and non-null counts for the test frame
test.info()

In [None]:
# Class balance: number of training rows per sentiment label
train['sentiment'].value_counts()

In [None]:
# First ten training rows labelled with sentiment 1
train.loc[train['sentiment'] == 1].head(10)

## Splitting out the X variable from the target

In [None]:
# Features are the raw message text; the target is the sentiment label.
X, y = train['message'], train['sentiment']

In [None]:
# Sanity check: the target series (sentiment labels)
print(y)


In [None]:
# Sanity check: the feature series (raw message text)
print(X)

In [None]:
from nltk.stem import PorterStemmer

# initiate Stemmer
# Created once at module level so preprocessor() can reuse the same
# instance for every word it stems.
porter_stemmer=PorterStemmer()

def preprocessor(text):
    """Lower-case, normalise and Porter-stem a raw text string.

    Steps, in order:
      1. lower-case the text;
      2. drop a single-letter token at the very start of the text;
      3. replace a few common function words (in, the, all, for, and, on)
         that are surrounded by whitespace with the ``_connector_`` marker;
      4. stem every whitespace-separated token with ``porter_stemmer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    text = text.lower()

    # re.sub needs (pattern, replacement, string). The anchor must be an
    # unescaped `^` and the whitespace class must be `\s`, otherwise the
    # pattern looks for a literal caret followed by the letter "s".
    text = re.sub(r'^[a-zA-Z]\s+', '', text)
    text = re.sub("\\s+(in|the|all|for|and|on)\\s+", " _connector_ ", text)

    # stem words; str.split() also discards empty strings produced by
    # leading/trailing whitespace
    words = text.split()
    stemmed_words = [porter_stemmer.stem(word=word) for word in words]

    # Join with a space so token boundaries survive for the tokenizer;
    # an empty separator would fuse the whole document into one word.
    return ' '.join(stemmed_words)

In [None]:
def text_tokenizer(text):
    """Split text into tokens, detaching each non-word character on its left.

    A space is inserted before every non-word character (anything matching
    ``\\W``), then the text is split on runs of whitespace. A special
    character therefore starts a new token but stays attached to whatever
    follows it, e.g. ``"#climate change!"`` -> ``["#climate", "change", "!"]``.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list[str]
        Non-empty tokens in order of appearance; ``[]`` for empty or
        whitespace-only input.
    """
    # provide space before special characters
    spaced = re.sub(r"(\W)", r" \1", text)

    # split on whitespace; leading/trailing whitespace (including the space
    # just inserted before a leading special character) yields empty strings
    # from re.split, which carry no information and are dropped.
    return [token for token in re.split(r"\s+", spaced) if token]

## Turning text into something your model can read

In [None]:
# TF-IDF features over unigrams and bigrams produced by text_tokenizer.
#   min_df=2    -> drop terms that occur in fewer than 2 messages
#   max_df=0.75 -> drop terms that occur in more than 75% of messages
# stop_words must be the *string* "english" to select scikit-learn's built-in
# English stop-word list; a list such as ["english"] is treated as a custom
# list and would only filter the literal token "english".
# NOTE(review): this changes the feature set, so re-check validation F1.
# Earlier experiments tried max_df=0.85, a custom preprocessor and a
# hand-picked stop list ("all", "in", "the", "is", "and").
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=text_tokenizer,
    min_df=2,
    stop_words="english",
    max_df=0.75,
)
X_vectorized = vectorizer.fit_transform(X)

In [None]:
#vectorizer.stop_words_

In [None]:
# Inspect the TF-IDF matrix returned by fit_transform
print(X_vectorized)

## Splitting the training data into a training and validation set

In [None]:
# Hold out 20% of the vectorized training data for validation. The split is
# shuffled, stratified on the label, and seeded so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_vectorized,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=25,
)

## Training the model and evaluating using the validation set 

In [None]:
# Random forest with default hyper-parameters; fit() returns the estimator.
rfc = RandomForestClassifier().fit(X_train, y_train)
rfc_pred = rfc.predict(X_val)


In [None]:
# Random-forest predictions for the validation split
print(rfc_pred)

In [None]:
# Linear SVM with default hyper-parameters; fit() returns the estimator.
svm_lsvc = LinearSVC().fit(X_train, y_train)
svm_lsvc_pred = svm_lsvc.predict(X_val)

In [None]:
# LinearSVC predictions for the validation split
print(svm_lsvc_pred)

In [None]:
# Logistic regression with default hyper-parameters; fit() returns the estimator.
lr = LogisticRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_val)

## Checking the performance of our model on the validation set

In [None]:
# Macro-averaged F1 of the LinearSVC predictions on the validation split
f1_score(y_val, svm_lsvc_pred, average="macro")

In [None]:
# Macro-averaged F1 of the random-forest predictions on the validation split
f1_score(y_val, rfc_pred, average="macro")

In [None]:
# Macro-averaged F1 of the logistic-regression predictions on the validation split
f1_score(y_val, lr_pred, average="macro")

In [None]:
from sklearn import metrics

# Classification report for the LinearSVC predictions on the validation split
print(metrics.classification_report(y_val, svm_lsvc_pred))

In [None]:
# Classification report for the random-forest predictions on the validation split
print(metrics.classification_report(y_val, rfc_pred))

In [None]:
# Classification report for the logistic-regression predictions on the validation split
print(metrics.classification_report(y_val, lr_pred))

## Getting our test set ready 

In [None]:
# Vectorise the test messages with the vectorizer that was already fitted on
# the training messages: transform only, no refit.
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [None]:
# Predict test-set sentiment with the fitted LinearSVC model
y_pred = svm_lsvc.predict(test_vect)

In [None]:
# Inspect the predicted test-set labels
print(y_pred)

In [None]:
# Attach the predictions to the test frame as a new 'sentiment' column
test['sentiment'] = y_pred

In [None]:
# Check that the 'sentiment' column was added to the test frame
test.head()

## Creating an output csv for submission

In [None]:
# Write only tweetid and the predicted sentiment to testsubmission.csv,
# leaving out the DataFrame index column
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)