<a href="https://colab.research.google.com/github/ithabibi/Persian-Opinion-Mining-and-Sentiment-Analysis/blob/main/use-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Persian Sentiment Analysis With Fasttext language Model and LSTM neural network
### Persian sentiment analysis step by step guide


---


There are three steps, and we will go through them together.

## Step 1 Load fasttext model

In [None]:
!pip install pybind11==2.11.1
!pip install fasttext==0.9.2 

#!pip install keras==2.14.0
!pip install tensorflow==2.12.0 #For Deep Learning
!pip install keras==2.12.0 #A wrapper for TensorFlow for simplicity

!pip install hazm==0.7.0
!pip install pandas==1.5.3
!pip install numpy==1.23

import pandas
import random
import numpy as np
import hazm
import keras.backend as K
import fasttext 

In [None]:
#load and unzip ELM
!rm -rf /content/cc.fa.300.bin.gz
!rm -rf /content/cc.fa.300.bin

!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz
!gunzip /content/cc.fa.300.bin.gz

%time
fasttext_model = fasttext.load_model("/content/cc.fa.300.bin")

## Step 2 Load LSTM learned model

In [None]:
# Download, unpack and load the trained sentiment model.
# The two `rm -rf` calls clear any archive/model left by a previous run of this cell.
!rm -rf /content/learned-query-sentiment-fasttext.model.zip
!rm -rf /content/learned-query-sentiment-fasttext.model

# NOTE(review): wget and unzip write to the current working directory while the other
# commands use absolute /content paths - this presumably relies on Colab's default
# working directory being /content; confirm before running elsewhere.
!wget https://raw.githubusercontent.com/ithabibi/Persian-Opinion-Mining-and-Sentiment-Analysis/main/learned-query-sentiment-fasttext.model.zip
!unzip /content/learned-query-sentiment-fasttext.model.zip
# Shape of the per-text input tensor built in the cells below:
# (1, max_vocab_token, embedding_dim).
embedding_dim = 300 #@param {type:"integer"} #The number 300, is the dimensions of the model
# Number of leading tokens of a text that are fed to the model; later tokens are ignored.
max_vocab_token = 8 #@param {type:"integer"}

from keras.models import load_model

#del model  # deletes the existing model

# Load the model from the directory extracted above.
LSTM_model = load_model('/content/learned-query-sentiment-fasttext.model')

## Step 3 enter persian text and booooom!!!

In [None]:
#@title using model
# Score one Persian text with the LSTM model and render the positive/negative
# percentages as HTML.
user_text = "\u062E\u06CC\u0644\u06CC \u06AF\u0648\u0634\u06CC\u0647 \u062E\u0648\u0628\u06CC\u0647. \u062A\u0634\u062E\u06CC\u0635 \u0686\u0647\u0631\u0647 \u062F\u0627\u0631\u0647. \u062F\u0627\u062E\u0644 \u062C\u0639\u0628\u0647 \u06A9\u0627\u0648\u0631 \u06AF\u0648\u0634\u06CC \u0648 \u0645\u062D\u0627\u0641\u0638 \u0635\u0641\u062D\u0647 \u062F\u0627\u0631\u0647. \u0645\u0646 \u062F\u06CC\u0631\u0648\u0632 \u0628\u0647 \u062F\u0633\u062A\u0645 \u0631\u0633\u06CC\u062F\u0647 \u0639\u0627\u0644\u06CC\u0647 \u0645\u0631\u0633\u06CC \u0627\u0632 \u062F\u06CC\u062C\u06CC \u06A9\u0627\u0644\u0627" #@param {type:"string"}
# Public import path; importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

_normalizer = hazm.Normalizer()
# Whitespace-only input is treated like empty input instead of being scored.
if user_text.strip():
  normal_text = _normalizer.normalize(user_text)
  tokenized_text = hazm.word_tokenize(normal_text)

  # Zero-filled input tensor of shape (1, max_vocab_token, embedding_dim):
  # one sample, one row per token, one column per fastText dimension.
  vector_text = np.zeros((1,max_vocab_token,embedding_dim),dtype=K.floatx())

  # Only the first max_vocab_token tokens are used; longer texts are truncated.
  for position, token in enumerate(tokenized_text[:max_vocab_token]):
    # get_word_id returns -1 for words outside the fastText vocabulary. Those rows stay
    # all-zero. This replaces `token not in fasttext_model.words`, which scanned the
    # whole vocabulary list for every token.
    if fasttext_model.get_word_id(token) == -1:
      continue
    vector_text[0, position, :] = fasttext_model.get_word_vector(token)

  # result[0][1] is reported as the positive share, result[0][0] as the negative share.
  result = LSTM_model.predict(vector_text)
  pos_percent = str(int(result[0][1]*100))+" % 😍"
  neg_percent = str(int(result[0][0]*100))+" % 🤕"
  display(HTML("<div style='text-align: center'><div style='display:inline-block'><img height='64px' width='64px' src='https://images.rawpixel.com/image_png_1000/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTEwL3JtNTg2LWlubG92ZWZhY2UtMDFfMS1sOWQzYzlxMC5wbmc.png'/><h4>{}</h4></div> | <div style='display:inline-block'><img height='64px' width='64px' src='https://images.rawpixel.com/image_png_1000/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTEwL3JtNTg2LWNyeWluZ2ZhY2UtMDFfMi1sOWQzYnh0MC5wbmc.png'/><h4>{}</h4></div></div>".format(pos_percent,neg_percent)))
else:
  print("Please enter your text")

## Step 3.1 enter batch persian text

In [None]:
#@title laod test Dataset
!wget https://raw.githubusercontent.com/ithabibi/Persian-Opinion-Mining-and-Sentiment-Analysis/main/related-query-whit-lexion.csv

# Read the sentiment-tagged test dataset from /content (the Colab working directory).
# The cell below reads three columns from it: Query, Suggestion and Score.
# Per the original author's note, Query is the feature and Suggestion is the label.
csv_dataset = pandas.read_csv("/content/related-query-whit-lexion.csv")

def CleanPersianText(text):
  """Normalize a Persian string with hazm.

  Args:
    text: Raw text. Non-string values (for example the float NaN that pandas
      yields for an empty CSV cell) are treated as missing.

  Returns:
    The normalized string, or "" when `text` is not a string. The callers in
    this notebook already skip empty strings.
  """
  if not isinstance(text, str):
    return ""
  # Build the normalizer once and reuse it instead of constructing one per row.
  normalizer = getattr(CleanPersianText, "_normalizer", None)
  if normalizer is None:
    normalizer = CleanPersianText._normalizer = hazm.Normalizer()
  return normalizer.normalize(text)

# Build one row per record: [normalized Query, Suggestion, Score].
# Suggestion and Score are carried over from the dataset; previously they were
# replaced by the constant strings "1" and "2", which threw the zipped values away.
# Per the original note, Suggestion holds 1,2,3 or positive,negative,neutral.
revlist = [
  [CleanPersianText(query), suggestion, score]
  for query, suggestion, score in zip(csv_dataset['Query'], csv_dataset['Suggestion'], csv_dataset['Score'])
]

# Report how many rows were loaded.
print("*" * 88)
print("Total dataset count {}".format(len(revlist)))

In [None]:
#@title Result
# Score the first rows of revlist and print one line per row:
# "<row>: <positive>%😍 <negative>%🤕 <text>".
# Public import path; importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

_normalizer = hazm.Normalizer()
max_rows = 100  # preview size; use len(revlist) to score the whole dataset
# Bounded by len(revlist) so a dataset with fewer than max_rows rows does not raise IndexError.
for item in range(min(max_rows, len(revlist))):
  user_text = revlist[item][0]
  if user_text != "":
    normal_text = _normalizer.normalize(user_text)
    tokenized_text = hazm.word_tokenize(normal_text)
    # Zero-filled input tensor of shape (1, max_vocab_token, embedding_dim).
    vector_text = np.zeros((1,max_vocab_token,embedding_dim),dtype=K.floatx())

    # Only the first max_vocab_token tokens are used; longer texts are truncated.
    for position, token in enumerate(tokenized_text[:max_vocab_token]):
      # get_word_id returns -1 for out-of-vocabulary words, whose rows stay all-zero.
      # This avoids scanning the whole `fasttext_model.words` list for every token.
      if fasttext_model.get_word_id(token) == -1:
        continue
      vector_text[0, position, :] = fasttext_model.get_word_vector(token)

    # verbose must be an int (0 = silent), not the string '0'. The workers /
    # use_multiprocessing / max_queue_size arguments only apply to generator or
    # Sequence input and are dropped for this NumPy array.
    result = LSTM_model.predict(vector_text, verbose=0)
    # result[0][1] is reported as the positive share, result[0][0] as the negative share.
    pos_percent = str(int(result[0][1]*100))
    neg_percent = str(int(result[0][0]*100))
    print(str(item) + ": " + pos_percent +"%😍" +" " + neg_percent +"%🤕" + " " + revlist[item][0] , "\n")
  else:
    print("Please enter your text")
print("end")

کد زیر را به نحوی تغییر بده که خروجی result به revlist اضافه شود و نهایتا به عنوان یک فایل csv ذخیره شود.

In [None]:
import pandas as pd
import hazm
import numpy as np

# Load the operator dataset that will be scored.
# NOTE(review): no visible cell downloads this file - it presumably has to be
# uploaded to /content before this cell runs; confirm.
csv_dataset = pd.read_csv("/content/merged_all_operator-data.from2009to2023B.csv")

def CleanPersianText(text):
    """Normalize a Persian string with hazm.

    Args:
        text: Raw text. Non-string values (for example the float NaN that
            pandas yields for an empty CSV cell) are treated as missing.

    Returns:
        The normalized string, or "" when `text` is not a string. The
        prediction loop in this notebook already skips empty strings.
    """
    if not isinstance(text, str):
        return ""
    # Build the normalizer once and reuse it instead of constructing one per row.
    normalizer = getattr(CleanPersianText, "_normalizer", None)
    if normalizer is None:
        normalizer = CleanPersianText._normalizer = hazm.Normalizer()
    return normalizer.normalize(text)

# Build one row per record. The six metadata columns are carried over from the
# dataset unchanged (previously they were overwritten with the constants "1".."6",
# so the saved CSV lost the real data); the query text is normalized.
revlist = [
    [index, top25, value, date, keyword, get_type, CleanPersianText(query)]
    for index, top25, value, date, keyword, get_type, query in zip(
        csv_dataset['index'], csv_dataset['top25'], csv_dataset['value'],
        csv_dataset['date'], csv_dataset['keyword'], csv_dataset['get_type'],
        csv_dataset['Query'])
]

# Report how many rows were loaded.
print("*" * 88)
print("Total dataset count {}".format(len(revlist)))

_normalizer = hazm.Normalizer()
max_rows = 50  # number of leading rows to score; use len(revlist) for the full dataset

for item, row in enumerate(revlist):
    # Every row gets both prediction fields so all rows match `columns` below.
    # They stay None (empty in the CSV) for rows beyond max_rows or with empty text.
    pos_percent = neg_percent = None
    user_text = row[6]
    if item < max_rows and user_text != "":
        normal_text = _normalizer.normalize(user_text)
        tokenized_text = hazm.word_tokenize(normal_text)

        # Zero-filled input tensor of shape (1, max_vocab_token, embedding_dim).
        vector_text = np.zeros((1, max_vocab_token, embedding_dim), dtype=K.floatx())

        # Only the first max_vocab_token tokens are used; longer texts are truncated.
        for position, token in enumerate(tokenized_text[:max_vocab_token]):
            # get_word_id returns -1 for out-of-vocabulary words, whose rows stay all-zero.
            # This avoids scanning the whole `fasttext_model.words` list for every token.
            if fasttext_model.get_word_id(token) == -1:
                continue
            vector_text[0, position, :] = fasttext_model.get_word_vector(token)

        # verbose must be an int (0 = silent), not the string '0'. The workers /
        # use_multiprocessing / max_queue_size arguments only apply to generator or
        # Sequence input and are dropped for this NumPy array.
        result = LSTM_model.predict(vector_text, verbose=0)
        # result[0][1] is reported as the positive share, result[0][0] as the negative share.
        pos_percent = str(int(result[0][1] * 100))
        neg_percent = str(int(result[0][0] * 100))

    # Attach the prediction (or the empty placeholders) to the row.
    row.append(pos_percent)
    row.append(neg_percent)

# Convert the rows to a DataFrame.
columns = ['Index', 'Top25', 'Value', 'Date', 'Keyword', 'Get Type', 'Comment', 'Positive Percentage', 'Negative Percentage']
results_df = pd.DataFrame(revlist, columns=columns)

# Save the DataFrame as a CSV file.
results_df.to_csv('results_with_predictions.csv', index=False)

print("Results saved to results_with_predictions.csv")


کد زیر را به نحوی تغییر بده که خروجی result به revlist اضافه شود به صورتی که یک فیلد احساس اضافه شود و اگر مثبت باشد درون آن با عبارت positive و اگر منفی است با عبارت negative پر شود و نهایتا به عنوان یک فایل csv ذخیره شود.

In [None]:
import pandas as pd
import hazm
import numpy as np

# Load the operator dataset that will be scored.
# NOTE(review): no visible cell downloads this file - it presumably has to be
# uploaded to /content before this cell runs; confirm.
csv_dataset = pd.read_csv("/content/merged_all_operator-data.from2009to2023B.csv")

def CleanPersianText(text):
    """Normalize a Persian string with hazm.

    Args:
        text: Raw text. Non-string values (for example the float NaN that
            pandas yields for an empty CSV cell) are treated as missing.

    Returns:
        The normalized string, or "" when `text` is not a string. The
        prediction loop in this notebook already skips empty strings.
    """
    if not isinstance(text, str):
        return ""
    # Build the normalizer once and reuse it instead of constructing one per row.
    normalizer = getattr(CleanPersianText, "_normalizer", None)
    if normalizer is None:
        normalizer = CleanPersianText._normalizer = hazm.Normalizer()
    return normalizer.normalize(text)

# Build one row per record. The six metadata columns are carried over from the
# dataset unchanged (previously they were overwritten with the constants "1".."6",
# so the saved CSV lost the real data); the query text is normalized.
revlist = [
    [index, top25, value, date, keyword, get_type, CleanPersianText(query)]
    for index, top25, value, date, keyword, get_type, query in zip(
        csv_dataset['index'], csv_dataset['top25'], csv_dataset['value'],
        csv_dataset['date'], csv_dataset['keyword'], csv_dataset['get_type'],
        csv_dataset['Query'])
]

# Report how many rows were loaded.
print("*" * 88)
print("Total dataset count {}".format(len(revlist)))

_normalizer = hazm.Normalizer()
max_rows = 50  # number of leading rows to score; use len(revlist) for the full dataset

for item, row in enumerate(revlist):
    # Every row gets a sentiment field so all rows match `columns` below.
    # It stays None (empty in the CSV) for rows beyond max_rows or with empty text.
    sentiment = None
    user_text = row[6]
    if item < max_rows and user_text != "":
        normal_text = _normalizer.normalize(user_text)
        tokenized_text = hazm.word_tokenize(normal_text)

        # Zero-filled input tensor of shape (1, max_vocab_token, embedding_dim).
        vector_text = np.zeros((1, max_vocab_token, embedding_dim), dtype=K.floatx())

        # Only the first max_vocab_token tokens are used; longer texts are truncated.
        for position, token in enumerate(tokenized_text[:max_vocab_token]):
            # get_word_id returns -1 for out-of-vocabulary words, whose rows stay all-zero.
            # This avoids scanning the whole `fasttext_model.words` list for every token.
            if fasttext_model.get_word_id(token) == -1:
                continue
            vector_text[0, position, :] = fasttext_model.get_word_vector(token)

        # verbose must be an int (0 = silent), not the string '0'. The workers /
        # use_multiprocessing / max_queue_size arguments only apply to generator or
        # Sequence input and are dropped for this NumPy array.
        result = LSTM_model.predict(vector_text, verbose=0)

        # Label the row by comparing the two scores: result[0][1] is treated as the
        # positive score, result[0][0] as the negative one; ties count as negative.
        sentiment = "positive" if result[0][1] > result[0][0] else "negative"

    row.append(sentiment)

# Convert the rows to a DataFrame.
columns = ['Index', 'Top25', 'Value', 'Date', 'Keyword', 'Get Type', 'Comment', 'Sentiment']
results_df = pd.DataFrame(revlist, columns=columns)

# Save the DataFrame as a CSV file.
results_df.to_csv('results_with_sentiment.csv', index=False)

print("Results saved to results_with_sentiment.csv")
