# Train Data

In [1]:
import pandas as pd
import numpy as np

import re
from textblob import TextBlob
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score

tfidf_vectorizer = TfidfVectorizer(
    max_df=0.90,
    ngram_range=(1,3), 
    min_df=2, 
    max_features=1500, 
    stop_words='english'
    )
rf=RandomForestClassifier(
    n_estimators=200,
    criterion='gini'
    )

data=pd.read_csv('Data/clean_data.csv')

tfidf_data=tfidf_vectorizer.fit_transform(data['cleanText'].apply(lambda x: np.str_(x)))

X_train,X_test,y_train,y_test=train_test_split(tfidf_data,data['sentiment'],test_size=0.3,random_state=0)

rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)

print(f1_score(y_test,y_pred,average='weighted'))

0.6441945970995222


## fun for cleaning test data

In [2]:
lemmatizer = WordNetLemmatizer()

#Create a function to clean tweets
def cleanText(text):
    text=str(text)  #Coverts Text to String
    text=re.sub(r'@[A-Za-z0-9]+','',text)  #Removing @Mentions
    text = re.sub(r'#[\w]*sxsw[\w]*', ' ', text,flags=re.I)  #Removing sxsw hashtag
    text=re.sub(r'#','',text)  #Removing # Symbols
    text=re.sub(r'RT[\s]+','',text)  #Removing ReTweets
    text=re.sub(r'https?:\/\/\s+','',text)  #Removing the hyperlinks
    text=re.sub(r'bit.ly[/\.\w]+','',text)  #Removing the shortlinks 
    text=text.replace(r'{html}',"") 
    cleanr = re.compile(r'<.*?>')
    text = re.sub(cleanr, '', text)
    text = re.sub(r'[0-9]+', '', text)  #Removing Numbers
    text = re.sub(r'[^A-Za-z]+', ' ', text)  #Removing all spacial character
    text = text.lower()  #Coverts Text To Lower Case
    
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text) 
    filtered_words = [w for w in tokens if w not in stopwords.words('english')]
    lemma_words=[lemmatizer.lemmatize(w) for w in filtered_words]
    return " ".join(lemma_words)

# run on test data

In [3]:
data=pd.read_csv('Data/test.csv')
data['cleanText']=data['tweet'].map(lambda s:cleanText(s))
data_tfidf=tfidf_vectorizer.transform(data['cleanText'])
y_pred=rf.predict(data_tfidf)
output=pd.concat([data['tweet_id'],pd.DataFrame(y_pred,columns=['sentiment'])],axis=1)
output.to_csv('Data/submission_file.csv',index=False)