## If you want to download datasets and trained models, go to this [link](https://drive.google.com/drive/folders/1Ltqj8wiuXxrl1p_KzrzfaJkn2EDfI-M4?usp=sharing) for convenience. Download file named word2vec sentiment analysys.zip and unzip it

In [2]:
%config IPCompleter.use_jedi = False

In [4]:
import pandas as pd
import gensim
import spacy
import nltk
from bs4 import BeautifulSoup
import unidecode
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import word2vec
import numpy as np

%run contractions.ipynb

**Import origin tweet dataset without preprocessing**

In [10]:
df = pd.read_csv("./pytorch dataset and model/tweet data.csv", encoding = "ISO-8859-1")
df

**Import saved dataset after preprocessing**

In [9]:
df = pd.read_csv("./pytorch dataset and model/lastFinalTweet.csv", encoding = "ISO-8859-1")
df

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can not update his facebook b...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,i dived many time for the ball managed to save...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feel itchy and like it on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,no it is not behaving at all i am mad why am i...
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,not the whole crew
...,...,...,...,...,...,...
1596816,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,just woke up having no school is the best feel...
1596817,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,thewdbcom very cool to hear old walt interview a
1596818,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,are you ready for your mojo makeover ask me fo...
1596819,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,happy th birthday to my boo of alll time tupac...


In [11]:
review = df.text

# Preprocessing

## 1) remove spacing

In [None]:
def remove_spacing(text):
    res = " ".join(text.split())
    return res
review = list(map(remove_spacing, review))

## 2) remove accent characters

In [None]:
def remove_accent_char(text):
    res = unidecode.unidecode(text)
    return res
review = list(map(remove_accent_char, review))

## 3) Contractions

In [None]:
c_re = re.compile('(%s)' % '|'.join(CONTRACTION_MAP.keys()))
def expand_contractions(s, contractions_dict=CONTRACTION_MAP):
    def replace(match):
        return contractions_dict[match.group(0)]
    return c_re.sub(replace, s)
review = list(map(expand_contractions, review))

## 4) Remove special character

In [None]:
# def remove_special_characters(text, remove_digits=True):
#     pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
#     text = re.sub(pattern, '', text)
#     return text
# review = list(map(expand_contractions, review))

In [None]:
text ="sorry! bed time came here (GMT+1) http://is.gd/fNge"

In [None]:
def remove_tweetName(text):
    res = re.sub("@\w+","", text)
    return res
review = list(map(remove_tweetName, review))


In [None]:
def remove_httpLink(text):
    res = re.sub("http\S+", "", text)
    return res
review = list(map(remove_httpLink, review))

In [None]:
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text
review = list(map(remove_special_characters, review))

## 5) remove stopwords

In [None]:
stopwordList = set(stopwords.words('english'))
def remove_stopwords(text):
    filted_sentence = " ".join([word for word in text.split() if word not in stopwordList])
    return filted_sentence
review = list(map(remove_stopwords, review))

## 6) remove nan value after preprocessing

In [None]:
def isNaN(string):
    return string != string

nan_index =[]
for ind, sent in enumerate(review):
    if isNaN(sent):
        nan_index.append(ind)
df = df.drop(nan_index)
df = df.reset_index(drop=True)
df

In [None]:
test_review = review

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
pipeline = ["tok2vec", "tagger", "parser", "ner"]

In [None]:
for doc in nlp.pipe(texts, batch_size=2000, disable=["transformer","tok2vec" ]):
    # Do something with the doc here
    print([(ent.lemma_) for ent in doc.ents])

In [None]:
review = list(map(lambda text: text.lower(), review))

In [None]:
df.text = review

In [None]:
def lemmatize_text(text):
    text = nltk.word_tokenize(text)
    return text

iter = round(len(review) / 10000)
batch_size = 10000
res_arr = []
for i in range(1, iter + 1, 1):
    print(i)
    res = list(map(lemmatize_text, review[((i-1)*batch_size) : (i)* batch_size]))
    res_arr = res_arr + res

res_arr

In [None]:
fulltext = review
tokentext = res_arr

# Tạo word2vec model

In [None]:
word2vec_model = word2vec.Word2Vec(tokentext)

In [16]:
word2vec_model

<gensim.models.word2vec.Word2Vec at 0x1fabd9f8048>

In [None]:
word2vec_model.init_sims(replace=True)

**Save lại word2vec model sau khi train xong**

In [None]:
model_name = "word2vec.model"
word2vec_model.save(model_name)

**Các từ gần nghĩa nhất với từ hero**

In [17]:
word2vec_model.wv.most_similar("hero")

[('bass', 0.6834656000137329),
 ('teardrop', 0.6448805332183838),
 ('metallica', 0.5705269575119019),
 ('player', 0.5653506517410278),
 ('playing', 0.5631834864616394),
 ('guitar', 0.540151834487915),
 ('favourite', 0.5208714604377747),
 ('mario', 0.5193303227424622),
 ('teardrops', 0.5149649977684021),
 ('fave', 0.5140201449394226)]

In [18]:
word2vec_model.wv.index2word

['i',
 'to',
 'the',
 'a',
 'is',
 'my',
 'it',
 'and',
 'you',
 'not',
 'for',
 'in',
 'am',
 'of',
 'have',
 'on',
 'me',
 'that',
 'so',
 'but',
 'just',
 'with',
 'be',
 'at',
 'do',
 'wa',
 'are',
 'day',
 'will',
 'this',
 'now',
 'good',
 'up',
 'can',
 'get',
 'all',
 'out',
 'like',
 'go',
 'no',
 'got',
 'u',
 'love',
 'work',
 'today',
 'your',
 'too',
 'going',
 'time',
 'we',
 'back',
 'from',
 'one',
 'what',
 'lol',
 'know',
 'about',
 'im',
 'really',
 'had',
 'want',
 'see',
 'some',
 'did',
 'there',
 'night',
 'think',
 'if',
 'still',
 'new',
 'how',
 'well',
 'na',
 'they',
 'amp',
 'would',
 'need',
 'thanks',
 'home',
 'when',
 'ha',
 'oh',
 'more',
 'miss',
 'much',
 'he',
 'here',
 'off',
 'last',
 'an',
 'feel',
 'hope',
 'morning',
 'then',
 'make',
 'been',
 'tomorrow',
 'great',
 'twitter',
 'or',
 'her',
 'haha',
 'again',
 'wish',
 'its',
 'she',
 'sad',
 'come',
 'fun',
 'only',
 'why',
 'right',
 'week',
 'sleep',
 'bad',
 'very',
 'happy',
 'could',
 '

**Load word2vec model**

In [12]:
model = word2vec.Word2Vec.load("word2vec.model")
review = df.text

# Chuyển đổi text sang vector 

**Tính trung bình giá trị vector của từng câu**

In [None]:
def featureVecMethod(words, model, num_features=100):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model.wv.__getitem__(word))
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

**Hàm trả về mảng 2 chiều chứa giá trị vector của tất caả các caâu trong dataset**

In [None]:
def getAvgFeatureVecs(reviews, model, num_features=100):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
#         if counter%10000 == 0:
        print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

**Vì tính mảng 2 chiều trên hơi laâu nên khi tính xong ta sẽ lưu vào file tên là avarage vector**

In [None]:
trainDataVecs = getAvgFeatureVecs(token_sent, model, 100)
np.save('/pytorch dataset and model/avarage vector.npy', trainDataVecs)

**Load xem file có dđúng không và lưu vào biến X để chuẩn bị train**

In [10]:
X = np.load('/pytorch dataset and model/avarage vector.npy')
X.shape

(1596821, 100)

In [None]:
y = df.target
y= np.array(y)
y.shape

In [None]:
m = 0
nan_arr = []
for ind, value in enumerate(X):
    if np.isnan(value).any():
        nan_arr.append(ind)
#         print(ind)
newX = np.delete(X, nan_arr, axis=0)
newY = np.delete(y, nan_arr)
newX.shape

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(newX, newY, test_size=0.2, random_state=42)

In [11]:
print(X_train.shape, y_train.shape)

(1116218, 100) (1116218,)


In [14]:
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression(solver='lbfgs', verbose=1) 
lreg.fit(X_train, y_train) 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.3s finished


LogisticRegression(verbose=1)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

**Ta thay mô hình có độ chính xác là 73%**

In [15]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
y_pred = lreg.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[176616  62088]
 [ 65814 173862]]
              precision    recall  f1-score   support

           0       0.73      0.74      0.73    238704
           4       0.74      0.73      0.73    239676

    accuracy                           0.73    478380
   macro avg       0.73      0.73      0.73    478380
weighted avg       0.73      0.73      0.73    478380

0.7326351436096826


**Lưu và load model**

In [2]:
import pickle
pickle.dump(lreg, open("/pytorch dataset and model/logistic_classifier", 'wb'))
# pickle.dump(classifier, open("random_forest_classifier", 'wb'))

In [3]:
logistic_model = pickle.load(open("/pytorch dataset and model/logistic_classifier", 'rb'))