In [None]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split,StratifiedKFold
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.pipeline import make_pipeline, Pipeline
import nltk
from sklearn.model_selection import GridSearchCV
nltk.download('stopwords')
from textblob import TextBlob
from sklearn.metrics import classification_report, confusion_matrix  
# Global Parameters
# French stopword list from NLTK, held as a set; preprocess_text filters
# tokens against this module-level name.
stop_words = set(stopwords.words('french'))
# NOTE: nltk is already imported earlier in this cell; this import is redundant.
import nltk
# Extra NLTK resources. 'punkt' is presumably what word_tokenize needs and
# 'wordnet' what WordNetLemmatizer needs (the lemmatizer is imported above but
# not used in the visible cells) — TODO confirm both are still required.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def load_dataset(filename, cols, encoding='latin-1'):
    """Read a CSV file into a DataFrame and rename its columns.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file to read.
    cols : list-like of str
        New column names, one per column in the file, in file order.
    encoding : str, default 'latin-1'
        Text encoding passed to ``pandas.read_csv``. The default is kept for
        backward compatibility. Reading a UTF-8 file with 'latin-1' produces
        mojibake such as ``Ã©`` in place of ``é``; pass ``encoding='utf-8'``
        for such files.

    Returns
    -------
    pandas.DataFrame
        The loaded data with its columns renamed to ``cols``.

    Raises
    ------
    ValueError
        If the number of names in ``cols`` differs from the number of columns
        in the file.
    """
    cols = list(cols)
    dataset = pd.read_csv(filename, encoding=encoding)
    # Fail with an explicit message instead of pandas' generic length-mismatch error.
    if len(cols) != len(dataset.columns):
        raise ValueError(
            "Expected %d column names for %r, got %d"
            % (len(dataset.columns), filename, len(cols))
        )
    dataset.columns = cols
    return dataset

In [None]:
# Read the training CSV and label its three columns.
dataset = load_dataset(
    "/content/Train (7).csv",
    ['ID', 'text', 'label'],
)

In [None]:
# Preview the first rows of the freshly loaded training data.
dataset.head()

Unnamed: 0,ID,text,label
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1
3,U0TTYY8,ak slouma,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1


In [None]:
# (rows, columns) of the training data.
dataset.shape

(70000, 3)

In [None]:
def remove_unwanted_cols(dataset, cols):
    """Delete the named columns from ``dataset`` in place and return it.

    The frame passed in is mutated; the return value is that same object.
    A ``KeyError`` is raised if a name in ``cols`` is not a column.
    """
    for unwanted in cols:
        # pop() removes the column from the frame; the popped Series is discarded.
        dataset.pop(unwanted)
    return dataset

In [None]:
def preprocess_text(tweet):
    """Clean one text string and return it as space-joined tokens.

    Steps, in order:
      1. lowercase the text,
      2. remove URLs (``http...``, ``https...``, ``www...``),
      3. remove ``@user`` mentions and ``#`` characters,
      4. remove every character in ``string.punctuation``,
      5. tokenize with ``word_tokenize`` and drop tokens found in the
         module-level ``stop_words`` set.

    Parameters
    ----------
    tweet : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces. No stemming is applied.
    """
    # str.lower() returns a new string; the result must be rebound, otherwise
    # capitalised stopwords slip through the stop_words filter below.
    tweet = tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if w not in stop_words]
    # A Porter stemming pass used to be computed here but its result was never
    # returned; it has been dropped so the output is unchanged and no work is wasted.
    return " ".join(filtered_words)

In [None]:
# First cleaning pass over the training text.
dataset['text'] = dataset['text'].apply(preprocess_text)

In [None]:
def remove_emoji(text):
    """Return ``text`` with every run of characters in the ranges below removed."""
    codepoint_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        # NOTE: this last range is very wide (U+24C2 through U+1F251), so it
        # also strips any non-emoji character whose code point lies in between.
        "\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(codepoint_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub('', text)

In [None]:
# Strip emoji characters from the training text.
dataset['text'] = dataset['text'].apply(remove_emoji)

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy and does not cross newlines, so each tag on a
    line is removed separately and a lone ``<`` without a closing ``>`` on the
    same line is left alone.
    """
    return re.sub(r'<.*?>', '', text)

In [None]:
# Strip HTML-style tags from the training text.
dataset['text'] = dataset['text'].apply(remove_html)

In [None]:
def get_feature_vector(train_fit):
    """Fit a TF-IDF vectorizer on ``train_fit`` and return the fitted object.

    The vectorizer uses unigrams only, keeps at most 5000 features, applies
    sublinear term-frequency scaling and L2-normalises each row.
    """
    tfidf_options = dict(
        encoding='utf-8',
        ngram_range=(1, 1),
        max_features=5000,
        norm='l2',
        sublinear_tf=True,
    )
    # fit() returns the vectorizer itself, so the fitted instance is handed back.
    return TfidfVectorizer(**tfidf_options).fit(train_fit)

In [None]:
# Drop the ID column from the training frame.
# NOTE: remove_unwanted_cols deletes in place, so re-running this cell without
# reloading the data raises KeyError for the already-removed 'ID'.
dataset = remove_unwanted_cols(dataset, ['ID'])
# Preprocess data.
# NOTE(review): preprocess_text was already applied to this column in an
# earlier cell, so this is a second pass — confirm it is intentional.
dataset['text'] = dataset['text'].apply(preprocess_text)
dataset.head()

Unnamed: 0,text,label
0,3sbaaaaaaaaaaaaaaaaaaaa lek seim riahi 3sbaaaa...,-1
1,cha3eb fey9elkoum menghir ta7ayoul kressi,-1
2,bereau degage nathef ya slim walahi ya7chiw fi...,-1
3,ak slouma,1
4,entom titmanou lina a7na 3iid moubarik a7na ch...,-1


In [None]:
# Split dataset into Train,Test
# Same tf vector will be used for Testing sentiments on unseen trending data
texts = dataset['text'].to_numpy()
y = dataset['label'].to_numpy()

# Split row indices first so the vectorizer can be fitted on training rows only.
# Fitting TF-IDF on the full dataset lets the held-out rows influence the
# vocabulary and IDF weights, which inflates the accuracy measured on X_test.
# The same test_size / random_state as before are used, so the row assignment
# should match the previous split.
train_idx, test_idx = train_test_split(
    np.arange(len(texts)), test_size=0.2, random_state=30
)

tf_vector = get_feature_vector(texts[train_idx])

# X (all rows) is kept because earlier versions of this cell defined it.
X = tf_vector.transform(texts)
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [None]:
#from sklearn.model_selection import GridSearchCV

In [None]:
#param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

In [None]:
#print(grid.best_estimator_)

In [None]:
#grid_predictions = grid.predict(X_test)
#print(confusion_matrix(y_test,grid_predictions))

[[1947    5 1301]
 [ 227    9  662]
 [ 461   11 7259]]


In [None]:
# Train a support-vector classifier with default settings (fit returns the
# estimator itself) and report accuracy on the held-out split.
clf = SVC().fit(X_train, y_train)
y_predict_clf = clf.predict(X_test)
print(accuracy_score(y_test, y_predict_clf))

In [None]:
# Load the unlabeled test CSV and keep only its text column.
test_file_name = "/content/Test (7).csv"
test_ds = remove_unwanted_cols(
    load_dataset(test_file_name, ["ID", "text"]),
    ["ID"],
)
test_ds.head()

Unnamed: 0,text
0,barcha aaindou fiha hak w barcha teflim kadhalik
1,ye gernabou ye 9a7ba
2,saber w barra rabbi m3ak 5ouya
3,cha3Ã©bbb ta7aaaaannnnnnnnnnn tfouuhh
4,rabi y5alihoulek w yfar7ek bih w inchallah itc...


In [None]:
# Creating text feature
# Run the test text through the same cleaning sequence the training text went
# through (preprocess_text -> remove_emoji -> remove_html -> preprocess_text).
# Previously only one preprocess_text pass was applied here, so the features
# fed to the classifier were built from differently-cleaned text than it was
# trained on.
test_ds["text"] = (
    test_ds["text"]
    .apply(preprocess_text)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(preprocess_text)
)
# Reuse the vectorizer fitted on the training data; do not refit on test text.
test_feature = tf_vector.transform(test_ds["text"].to_numpy())
# Using svm for prediction
test_prediction_clf = clf.predict(test_feature)
print(test_prediction_clf)

[-1 -1  1 ...  1 -1  1]


In [None]:
# Submission
# Re-read the test file to recover the ID column, reusing the path defined when
# the test set was first loaded. The old code rebound `test_ds` (a DataFrame)
# to a path string, which broke any re-run of the prediction cell.
# The file's second column (the raw text) is named "label" here only so it can
# be overwritten with the predictions on the next line.
submission = load_dataset(test_file_name, ["ID", "label"])
submission["label"] = test_prediction_clf
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame (ID plus predicted label).
submission.head()

Unnamed: 0,ID,label
0,2DDHQW9,-1
1,5HY6UEY,-1
2,ATNVUJX,1
3,Q9XYVOQ,1
4,TOAHLRH,1
