<a href="https://colab.research.google.com/github/kaziunt2022/assignment_04/blob/main/INFO5731_Assignment_Four.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **INFO5731 Assignment Four**

In this assignment, you are required to conduct topic modeling, sentiment analysis based on **the dataset you created from assignment three**.

# **Question 1: Topic Modeling**

(30 points). This question is designed to help you develop a feel for the way topic modeling works, the connection to the human meanings of documents. Based on the dataset from assignment three, write a python program to **identify the top 10 topics in the dataset**. Before answering this question, please review the materials in lesson 8, especially the code for LDA and LSA. The following information should be reported:

(1) Features (top n-gram phrases) used for topic modeling.

(2) Top 10 clusters for topic modeling.

(3) Summarize and describe the topic for each cluster. 


In [1]:
# Import librariess
from sklearn.linear_model import LinearRegression
import re, string, nltk, spacy
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from gensim import corpora, models, utils

nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/kaziunt2022/assignment_04/main/Reviews.CSV") # read the reviews file from asignment 3
df.dropna(subset=["Reviews"], inplace=True) # drop null rows

In [3]:
## Cleaning the reviews
data = df.Reviews.values.tolist() # Convert each review to list
data = [re.sub('\s+', ' ', sentence) for sentence in data] # remose the line breakers
data = [re.sub("\'"," ", sentence) for sentence in data] # remocve the \'

def sent_to_words(reviews):
    """
    Input: sentence--> string
    Function: Tokenize the sentence and remove punctuations
    Output: tokenize and clean reviews
    """
    sentence = []
    for review in reviews:
        sentence.append(utils.simple_preprocess(str(review).encode('utf-8'), deacc=True))  # deacc=True removes punctuations
    return sentence
tokenize_reviews = list(sent_to_words(data))

In [4]:
## bigram and trigam mmodels 
bigram = models.Phrases(tokenize_reviews, min_count=5, threshold=100) # creat bigram phrases
bigram_model = models.phrases.Phraser(bigram) # bigram model
trigram_model = models.phrases.Phraser(models.Phrases(bigram[tokenize_reviews], threshold=100))



In [5]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
def remove_stopwords(reviews): 
    """
    Input: list of lists of reviews
    Func: remove all stopwords
    Output: tokenize reviews without stop words
    """
    return [[word for word in utils.simple_preprocess(str(review)) if word not in stop_words] for review in reviews]

def make_bigrams(reviews):
    """
    Input: tokenize reviews
    Func: make bigrams
    Output: bigrams of reviews
    """
    return [bigram_model[review] for review in reviews]

def make_trigrams(reviews):
    """
    Input: tokenize reviews
    Func: make trigrams
    Output: trigrams of bigram reviews
    """
    return [trigram_model[bigram_model[review]] for review in reviews]

def lemmatization(reviews, allowed=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Input: tokenize bigram reviews
    Func: return only Noun, adj, verb, adverbs
    Output: nouns, adj, verb, adv of reviews
    """
    output_reviews= []
    for sent in reviews:
        review = nlp(" ".join(sent)) 
        output_reviews.append([token.lemma_ for token in review if token.pos_ in allowed])
    return output_reviews

In [6]:
bigrame_reviews = make_bigrams(remove_stopwords(tokenize_reviews)) # take bigram of the Reviews without stopwords
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) # initiaize the nlp english model
lemmatize_reviews = lemmatization(bigrame_reviews, ['NOUN', 'ADJ', 'VERB', 'ADV']) # nouns, adj, verb, adv of reviews

In [7]:
id2word = corpora.Dictionary(lemmatize_reviews) # Create Dictionary
corpus = [id2word.doc2bow(review) for review in lemmatize_reviews] # freq of words

In [8]:
lda_model = models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics = 10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True) # create LDA model

In [9]:
lda_model.print_topics() # print lda topics

[(0,
  '0.021*"praise" + 0.015*"land" + 0.012*"shot" + 0.012*"silly" + 0.011*"group" + 0.011*"guess" + 0.010*"sadly" + 0.009*"dimension" + 0.009*"prove" + 0.008*"blend"'),
 (1,
  '0.016*"major" + 0.014*"narrative" + 0.009*"always" + 0.009*"white" + 0.008*"highlight" + 0.008*"leave" + 0.008*"franchise" + 0.007*"childhood" + 0.006*"damn" + 0.006*"surely"'),
 (2,
  '0.019*"also" + 0.017*"get" + 0.015*"thing" + 0.014*"new" + 0.014*"big" + 0.013*"villain" + 0.012*"say" + 0.012*"joke" + 0.011*"look" + 0.011*"star"'),
 (3,
  '0.019*"monster" + 0.018*"exactly" + 0.017*"disappoint" + 0.017*"rest" + 0.015*"reveal" + 0.014*"camera" + 0.012*"high" + 0.011*"regard" + 0.010*"demon" + 0.010*"crazy"'),
 (4,
  '0.105*"movie" + 0.047*"good" + 0.036*"scene" + 0.034*"great" + 0.033*"marvel" + 0.030*"action" + 0.028*"story" + 0.028*"character" + 0.027*"fight" + 0.027*"really"'),
 (5,
  '0.021*"mother" + 0.015*"middle" + 0.015*"sister" + 0.012*"amount" + 0.011*"mystical" + 0.011*"child" + 0.011*"bus" + 0.00

In [10]:
lsa_model = models.LsiModel(corpus, num_topics = 10, id2word = id2word) # create lsa model over corpus

In [11]:
lsa_model.print_topics() # print lsa topics with key words

[(0,
  '0.479*"movie" + 0.357*"film" + 0.257*"character" + 0.199*"good" + 0.186*"scene" + 0.159*"marvel" + 0.155*"action" + 0.152*"great" + 0.148*"also" + 0.145*"fight"'),
 (1,
  '-0.697*"movie" + 0.617*"film" + 0.139*"also" + 0.129*"character" + 0.122*"well" + 0.071*"great" + -0.067*"marvel" + -0.063*"good" + -0.063*"watch" + 0.056*"take"'),
 (2,
  '0.547*"film" + 0.290*"movie" + -0.239*"character" + -0.207*"go" + -0.178*"also" + -0.153*"really" + -0.129*"would" + 0.126*"marvel" + -0.124*"ring" + -0.120*"find"'),
 (3,
  '-0.619*"good" + -0.299*"scene" + 0.222*"well" + -0.203*"fight" + 0.183*"movie" + -0.169*"great" + -0.166*"also" + 0.137*"little" + 0.136*"feel" + 0.129*"character"'),
 (4,
  '0.410*"great" + 0.313*"really" + 0.263*"well" + 0.246*"character" + -0.234*"film" + -0.217*"go" + 0.167*"action" + 0.143*"also" + 0.121*"scene" + -0.115*"good"'),
 (5,
  '-0.401*"good" + 0.386*"scene" + -0.351*"marvel" + -0.256*"action" + -0.248*"story" + 0.216*"fight" + 0.184*"feel" + 0.173*"fil

In [12]:
# print out topics visioned by lda and lsa
lda_topics ="""
LDA:
    1)Familia Series with weapons and min flaw
    2)Hope in storyline Theme
    3)Expanding Casting
    4)Positve anf useful reunion album
    5)Attack and Young man's death
    6)Fabulous version
    7)Adventures Ending
    8)Feel of film characters
    9)Great marvel movie with story and action
    10)Uneccessary steel breaking"""
lsa_topics = """
LSA:
    1)Marveel Movies are great action movies
    2)Marvel movie is also well charactered
    3)Marevel movies go for real fight
    4)Marvel movies has realy great scenes
    5)Good and well characted movie
    6)Marvel film with action story
    7)Good and new action scenes
    8)Characters with real fighting scenes
    9)Movie is well actioned with great fight scenes
    10)Good scenes with new characters
    """
print(lda_topics)
print(lsa_topics)


LDA:
    1)Familia Series with weapons and min flaw
    2)Hope in storyline Theme
    3)Expanding Casting
    4)Positve anf useful reunion album
    5)Attack and Young man's death
    6)Fabulous version
    7)Adventures Ending
    8)Feel of film characters
    9)Great marvel movie with story and action
    10)Uneccessary steel breaking

LSA:
    1)Marveel Movies are great action movies
    2)Marvel movie is also well charactered
    3)Marevel movies go for real fight
    4)Marvel movies has realy great scenes
    5)Good and well characted movie
    6)Marvel film with action story
    7)Good and new action scenes
    8)Characters with real fighting scenes
    9)Movie is well actioned with great fight scenes
    10)Good scenes with new characters
    


# **Question 2: Sentiment Analysis**

(30 points). Sentiment analysis also known as opinion mining is a sub field within Natural Language Processing (NLP) that builds machine learning algorithms to classify a text according to the sentimental polarities of opinions it contains, e.g., positive, negative, neutral. The purpose of this question is to develop a machine learning classifier for sentiment analysis. Based on the dataset from assignment three, write a python program to implement a sentiment classifier and evaluate its performance. Notice: **80% data for training and 20% data for testing**.  

(1) Features used for sentiment classification and explain why you select these features.

(2) Select two of the supervised learning algorithm from scikit-learn library: https://scikit-learn.org/stable/supervised_learning.html#supervised-learning, to build a sentiment classifier respectively. 

(3) Compare the performance over accuracy, precision, recall, and F1 score for the two algorithms you selected. Here is the reference of how to calculate these metrics: https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9. 

In [14]:
# Write your code here
def review_classification(rating):
  """
  This function will return:
    very positive for 5
    positive for 4
    neutral for 3
    negative for 2
    very negative for 1
  """
    if rating == 5: return 'Very Positive'
    elif rating == 4: return 'Positive'
    elif rating == 3: return 'Neutral'
    elif rating == 2: return 'Negative'
    elif rating == 1: return 'Very Negative'

df = pd.read_csv("https://raw.githubusercontent.com/kaziunt2022/assignment_04/main/Amazon%20Reviews.CSV")
df.dropna(inplace=True, subset=['Rating', "Reviews"])
df["Sentiment"] = df["Rating"].apply(review_classification)

In [15]:
stopwords_list = stopwords.words('english') # importing stopwords
punctuations_list = string.punctuation # get punctuations
lemmatizer = WordNetLemmatizer() # initialize word lemmatizer
def preprocessing(text):
    """
    This function will clean the givern text
    """
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower())
    text = text + " ".join(emoticons).replace('-', '')
    tokenize_text = [lemmatizer.lemmatize(word.lower()) for word in nltk.tokenize.word_tokenize(text) if (word not in stopwords_list) and (word not in punctuations_list) and (len(word)>=2) and (word.isalnum())]
    return " ".join(tokenize_text)

df["Clean Review"] = df["Reviews"].apply(preprocessing) # Clean all the reviews got from txt file

In [16]:
"""
I am using yf_idf feature for it's simple and fast calculation of complex data
and getting the the word score efficientlly
"""
tf_idf = TfidfVectorizer(ngram_range=(1,2), max_features=1000) # inititate tf_idf model for uni gram to bi gram
tf_idf.fit(df['Clean Review']) # apply tf_idf model to cleaned data
X = tf_idf.transform(df['Clean Review']) # transfor our data into vector form
encoder = LabelEncoder()
y = encoder.fit_transform(df['Rating']) # transform ratings


In [17]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2) # Split the data aas: 80% training, 20% testing

In [18]:
svm_model = svm.SVC(kernel='linear') # Inititate SVC linear model for supervised modeling
svm_model.fit(X_train, y_train) # fit the data to model

predicted_y = svm_model.predict(X_test) # perdict the data
svm_report = classification_report(y_test, predicted_y, output_dict = True) # get the classification report in dict format

print("SVM Accuracy: ", accuracy_score(y_test, predicted_y)) # SVM accuracy
print("Classification Report: ")
print(classification_report(y_test, predicted_y)) # print report

SVM Accuracy:  0.7170273581053491
Classification Report: 
              precision    recall  f1-score   support

           0       0.68      0.84      0.75       533
           1       0.53      0.16      0.24       152
           2       0.47      0.23      0.31       178
           3       0.49      0.15      0.24       317
           4       0.77      0.94      0.85      1269

    accuracy                           0.72      2449
   macro avg       0.59      0.47      0.48      2449
weighted avg       0.68      0.72      0.67      2449



In [19]:
tree_model = DecisionTreeClassifier() # inititate decision tree model for supervised learning
tree_model.fit(X_train, y_train) # fit  the data

predicted_y = tree_model.predict(X_test) # perdict the data
tree_report = classification_report(y_test, predicted_y, output_dict = True) # get report of perdiction

print("Decision Tree Model Accuracy: ", accuracy_score(y_test, predicted_y)) # accuracy of the tree model
print("Classification Report: ")
print(classification_report(y_test, predicted_y)) # print out report

Decision Tree Model Accuracy:  0.749693752552062
Classification Report: 
              precision    recall  f1-score   support

           0       0.77      0.77      0.77       533
           1       0.62      0.47      0.54       152
           2       0.53      0.52      0.52       178
           3       0.57      0.49      0.53       317
           4       0.82      0.87      0.84      1269

    accuracy                           0.75      2449
   macro avg       0.66      0.62      0.64      2449
weighted avg       0.74      0.75      0.74      2449



In [21]:
# compare svm results with tree results
print("SVM Accuracy:", round(svm_report["accuracy"], 2), '\tvs\t', 
      "Decision Tree Accuracy:", round(tree_report["accuracy"], 2))
print("SVM Prescision:", round(svm_report["weighted avg"]["precision"], 2), '\tvs\t', 
      "Decision Tree Accuracy:", round(tree_report["weighted avg"]["precision"], 2))
print("SVM Prescision:", round(svm_report["weighted avg"]["recall"], 2), '\tvs\t', 
      "Decision Tree Accuracy:", round(tree_report["weighted avg"]["recall"], 2))
print("SVM Prescision:", round(svm_report["weighted avg"]["f1-score"], 2), '\tvs\t', 
      "Decision Tree Accuracy:", round(tree_report["weighted avg"]["f1-score"], 2))

SVM Accuracy: 0.72 	vs	 Decision Tree Accuracy: 0.75
SVM Prescision: 0.68 	vs	 Decision Tree Accuracy: 0.74
SVM Prescision: 0.72 	vs	 Decision Tree Accuracy: 0.75
SVM Prescision: 0.67 	vs	 Decision Tree Accuracy: 0.74


# **Question 3: House price prediction**

(40 points). You are required to build a **regression** model to predict the house price with 79 explanatory variables describing (almost) every aspect of residential homes. The purpose of this question is to practice regression analysis, an supervised learning model. The training data, testing data, and data description files can be download here: https://github.com/unt-iialab/info5731_spring2021/blob/main/assignment/assignment4-question3-data.zip. Here is an axample for implementation: https://towardsdatascience.com/linear-regression-in-python-predict-the-bay-areas-home-price-5c91c8378878. 


In [22]:
# Write your code here
train_data = pd.read_csv("https://raw.githubusercontent.com/kaziunt2022/assignment_04/main/train.csv") # train data
test_data = pd.read_csv("https://raw.githubusercontent.com/kaziunt2022/assignment_04/main/test.csv") # test data

In [23]:
train_data.fillna(train_data.mean(), inplace = True) # remove null rows
test_data.fillna(test_data.mean(), inplace = True) # remove null rows

In [24]:
# encode all the columns in string form to digit form
from sklearn.preprocessing import LabelEncoder
columns = ('GarageCond', 'LandContour', 'RoofStyle', 'RoofMatl', 'Heating', 'MiscFeature', 'SaleType', 'GarageType', 'Electrical', 
           'SaleCondition', 'Foundation', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'FireplaceQu', 'LotConfig', 'Neighborhood', 
           'Condition1', 'Condition2', 'Utilities', 'BldgType', 'HouseStyle','PoolQC', 'BsmtQual', 'BsmtCond', 'GarageQual',
           'BsmtExposure', 'ExterQual', 'ExterCond','HeatingQC', 'KitchenQual', 'BsmtFinType1','BsmtFinType2', 'Functional',
           'Fence', 'GarageFinish', 'LandSlope','LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass',
           'OverallCond', 'YrSold', 'MoSold', 'MSZoning')

for column in columns:
    encoder = LabelEncoder()
    train_data[column] = encoder.fit(list(train_data[column].values)).transform(list(train_data[column].values))
for column in columns:
    encoder = LabelEncoder()
    test_data[column] = encoder.fit(list(test_data[column].values)).transform(list(test_data[column].values))

In [25]:
# split the data such data first 80 columns belong to X and the last column will goes to y
X_train = train_data[train_data.columns[:80]]
y_train = train_data['SalePrice']
# store X test data
X_test = test_data[test_data.columns[:80]]

In [30]:
rg_model = LinearRegression() # initiate linear regression model
rg_model.fit(X_train, y_train) # fit the training data into model

LinearRegression()

In [28]:
y_perdicted = rg_model.predict(X_test) # perdict data for test x
fill_data = pd.read_csv("https://raw.githubusercontent.com/kaziunt2022/assignment_04/main/test.csv") # import test document for further documentation
fill_data["Perdicted Price"] = y_perdicted # store the predicted results
fill_data.to_csv("Perdicted Prices.CSV", index=False) # save csv file