In [None]:
#Import libraries
import re
import pickle
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import nltk
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
#Dataset found on Kaggle.com
data = pd.read_csv('dataset/stock_market_comments.csv')
# Last expression of the cell: rendered with the notebook's rich DataFrame display
data.head()

In [None]:
#Encode every negative sentiment as 0 by mapping the -1 label to 0
data['Sentiment'] = data['Sentiment'].replace({-1: 0})

In [None]:
#Bar plot of the class balance (0 = negative, 1 = positive)
# Dedicated names: x and y are bound to the model features/labels further down.
sentiment_labels = ['Negative', 'Positive']
sentiment_counts = [
    (data['Sentiment'] == 0).sum(),
    (data['Sentiment'] == 1).sum(),
]
fig, ax = plt.subplots()
ax.bar(sentiment_labels, sentiment_counts)
ax.set_title('Number of comments per sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of comments')
plt.show()

In [None]:
def preprocess_text(df):
    """Clean the raw comments in ``df['Text']`` and store them in ``df['cleaned text']``.

    Each comment is lowercased, stripped of the literal ``{html}`` marker,
    of HTML-like tags, of URLs and of digits, then tokenized on word
    characters, filtered against the NLTK English stop-word list, stemmed
    with the Porter stemmer and finally passed through the WordNet
    lemmatizer. The surviving tokens are re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw comments in a ``'Text'`` column. Any index is
        accepted: rows are processed in positional order. Missing values are
        treated as empty comments; other non-string values are converted
        with ``str``.

    Returns
    -------
    df : pandas.DataFrame
        The same frame object, modified in place, with an added
        ``'cleaned text'`` column.
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stopwords_set = set(stopwords.words('english'))

    # Compiled once, outside the loop, instead of once per comment.
    tag_pattern = re.compile('<.*?>')
    url_pattern = re.compile(r'http\S+')
    digit_pattern = re.compile('[0-9]+')

    clean_text_list = []
    text_column = df['Text']
    # Iterate over the values rather than doing df['Text'][i]: the latter is a
    # label lookup and raises KeyError (or reads the wrong row) whenever the
    # index is not exactly 0..n-1, e.g. after filtering or shuffling rows.
    for raw_text, is_missing in zip(text_column, text_column.isna()):
        # str(NaN) would otherwise inject the spurious token "nan".
        text = "" if is_missing else str(raw_text)

        # Lowercasing, removing markup, URLs and digits
        text = text.lower().replace('{html}', "")
        text = tag_pattern.sub('', text)
        text = url_pattern.sub('', text)
        text = digit_pattern.sub('', text)

        # Tokenization (keeps runs of word characters, drops punctuation)
        tokens = tokenizer.tokenize(text)

        # Removing stop words
        filtered_words = [w for w in tokens if w not in stopwords_set]

        # Stemming
        stem_words = [stemmer.stem(w) for w in filtered_words]

        # Lemmatization, applied to the stemmed tokens
        lemma_words = [lemmatizer.lemmatize(w) for w in stem_words]

        clean_text_list.append(" ".join(lemma_words))

    # A plain list is assigned positionally, so it lines up with any index.
    df['cleaned text'] = clean_text_list

    return df

In [None]:
def vectorize(df, n_components=150, vectorizer_path="Tfidf_Vectorizer.pkl",
              pca_path="pca.pkl", random_state=42):
    """Convert cleaned comments to PCA-reduced TF-IDF features.

    A ``TfidfVectorizer`` is fitted on ``df['cleaned text']``, the resulting
    matrix is converted to a dense array and reduced with ``PCA``. Both
    fitted transformers are pickled so the same transformation can be
    re-applied later.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'cleaned text'`` column of preprocessed comments.
    n_components : int, float, str or None, default=150
        Forwarded to ``PCA``. An integer value is capped at
        ``min(n_samples, n_features)``, because PCA rejects larger values;
        this lets the function run on small datasets.
    vectorizer_path : str, default="Tfidf_Vectorizer.pkl"
        File the fitted vectorizer is pickled to.
    pca_path : str, default="pca.pkl"
        File the fitted PCA is pickled to.
    random_state : int or None, default=42
        Seed forwarded to ``PCA`` so that repeated runs give the same
        features when a randomized SVD solver is selected.

    Returns
    -------
    vectorized_text : numpy.ndarray
        2D array with one row per comment and one column per retained
        principal component.
    """
    vectorizer = TfidfVectorizer()
    # NOTE: .toarray() builds a dense (n_comments, vocabulary_size) matrix,
    # which can be memory-hungry for large corpora.
    tfidf_matrix = vectorizer.fit_transform(df['cleaned text']).toarray()

    if isinstance(n_components, int):
        # Asking for more components than rows or columns raises ValueError.
        n_components = min(n_components, *tfidf_matrix.shape)

    pca = PCA(n_components=n_components, random_state=random_state)
    vectorized_text = pca.fit_transform(tfidf_matrix)

    #Save vectorizer
    with open(vectorizer_path, 'wb') as file:
        pickle.dump(vectorizer, file)

    #Save pca
    with open(pca_path, 'wb') as file:
        pickle.dump(pca, file)

    return vectorized_text

In [None]:
#Preprocess data
# preprocess_text adds the 'cleaned text' column to `data`; vectorize turns that
# column into the feature matrix x; y holds the labels from the Sentiment column.
# NOTE(review): vectorize fits the TF-IDF vectorizer and the PCA on every row of
# `data`, i.e. before the train/test split performed further down, so the
# held-out rows influence the learned features. Consider splitting first and
# fitting on the training rows only - confirm whether this is intended.
data = preprocess_text(data)
x = vectorize(data)
y = data["Sentiment"]

In [None]:
#Hold out 30% of the rows for testing, keeping the class ratio (stratified)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
#Train model and predict test set
# probability=True makes SVC fit an extra, internally cross-validated
# probability calibration whose data shuffling is random; seeding it keeps the
# saved model's probability estimates reproducible, like the seeded split.
model_SVC = SVC(kernel='linear', probability=True, random_state=42)
model_SVC.fit(x_train, y_train)

y_pred = model_SVC.predict(x_test)
print("{} Accuracy: {}".format("svc",accuracy_score(y_test,y_pred)))

In [None]:
#Persist the trained classifier to disk with pickle
pkl_filename = "sentimentw_svm_model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(model_SVC, model_file)