<a href="https://colab.research.google.com/github/unt-iialab/INFO5731_Spring2020/blob/master/Assignments/INFO5731_Assignment_Four.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **INFO5731 Assignment Four**

In this assignment, you are required to conduct topic modeling, sentiment analysis based on **the dataset you created from assignment three**.

# **Question 1: Topic Modeling**

(30 points). This question is designed to help you develop a feel for the way topic modeling works, the connection to the human meanings of documents. Based on the dataset from assignment three, write a python program to **identify the top 10 topics in the dataset**. Before answering this question, please review the materials in lesson 8, especially the code for LDA and LSA. The following information should be reported:

(1) Features (top n-gram phrases) used for topic modeling.

(2) Top 10 clusters for topic modeling.

(3) Summarize and describe the topic for each cluster. 


In [69]:
# Import librariess
from sklearn.linear_model import LinearRegression
import re, string, nltk, spacy
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from gensim import corpora, models, utils

nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
df = pd.read_csv("Reviews.CSV") # read the reviews file from asignment 3
df.dropna(subset=["Reviews"], inplace=True) # drop null rows

In [63]:
## Cleaning the reviews
data = df.Reviews.values.tolist() # Convert each review to list
data = [re.sub('\s+', ' ', sentence) for sentence in data] # remose the line breakers
data = [re.sub("\'"," ", sentence) for sentence in data] # remocve the \'

def sent_to_words(reviews):
    """
    Input: sentence--> string
    Function: Tokenize the sentence and remove punctuations
    Output: tokenize and clean reviews
    """
    sentence = []
    for review in reviews:
        sentence.append(utils.simple_preprocess(str(review).encode('utf-8'), deacc=True))  # deacc=True removes punctuations
    return sentence
tokenize_reviews = list(sent_to_words(data))

In [64]:
## bigram and trigam mmodels 
bigram = models.Phrases(tokenize_reviews, min_count=5, threshold=100) # creat bigram phrases
bigram_model = models.phrases.Phraser(bigram) # bigram model
trigram_model = models.phrases.Phraser(models.Phrases(bigram[tokenize_reviews], threshold=100))

In [68]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
def remove_stopwords(reviews): 
    """
    Input: list of lists of reviews
    Func: remove all stopwords
    Output: tokenize reviews without stop words
    """
    return [[word for word in utils.simple_preprocess(str(review)) if word not in stop_words] for review in reviews]

def make_bigrams(reviews):
    """
    Input: tokenize reviews
    Func: make bigrams
    Output: bigrams of reviews
    """
    return [bigram_model[review] for review in reviews]

def make_trigrams(reviews):
    """
    Input: tokenize reviews
    Func: make trigrams
    Output: trigrams of bigram reviews
    """
    return [trigram_model[bigram_model[review]] for review in reviews]

def lemmatization(reviews, allowed=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Input: tokenize bigram reviews
    Func: return only Noun, adj, verb, adverbs
    Output: nouns, adj, verb, adv of reviews
    """
    output_reviews= []
    for sent in reviews:
        review = nlp(" ".join(sent)) 
        output_reviews.append([token.lemma_ for token in review if token.pos_ in allowed])
    return output_reviews

In [70]:
bigrame_reviews = make_bigrams(remove_stopwords(tokenize_reviews)) # take bigram of the Reviews without stopwords
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) # initiaize the nlp english model
lemmatize_reviews = lemmatization(bigrame_reviews, ['NOUN', 'ADJ', 'VERB', 'ADV']) # nouns, adj, verb, adv of reviews

In [71]:
id2word = corpora.Dictionary(lemmatize_reviews) # Create Dictionary
corpus = [id2word.doc2bow(review) for review in lemmatize_reviews] # freq of words

In [77]:
lda_model = models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics = 10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True) # create LDA model

In [90]:
lda_model.print_topics() # print lda topics

[(0,
  '0.043*"familiar" + 0.025*"serious" + 0.024*"weapon" + 0.024*"pacing" + 0.020*"min" + 0.019*"flaw" + 0.015*"finish" + 0.013*"difference" + 0.012*"comedic_time" + 0.009*"overdone"'),
 (1,
  '0.021*"storyline" + 0.015*"hope" + 0.014*"usual" + 0.012*"leave" + 0.010*"theme" + 0.010*"man" + 0.010*"amount" + 0.009*"attempt" + 0.009*"recent" + 0.009*"year"'),
 (2,
  '0.029*"invest" + 0.027*"doubt" + 0.021*"expand" + 0.015*"casting" + 0.012*"visually_stunning" + 0.009*"concern" + 0.000*"underdeveloped" + 0.000*"priority" + 0.000*"loyal" + 0.000*"sanity"'),
 (3,
  '0.000*"reunion" + 0.000*"positively" + 0.000*"bollywood" + 0.000*"biopic" + 0.000*"useful" + 0.000*"curate" + 0.000*"album" + 0.000*"forever" + 0.000*"ye" + 0.000*"stoic"'),
 (4,
  '0.026*"middle" + 0.021*"wenwu" + 0.020*"shaun" + 0.016*"ta_lo" + 0.014*"young" + 0.014*"attack" + 0.014*"dweller" + 0.012*"death" + 0.012*"katy" + 0.011*"pendant"'),
 (5,
  '0.040*"forgettable" + 0.026*"version" + 0.021*"meh" + 0.008*"fabulous" + 0

In [83]:
lsa_model = models.LsiModel(corpus, num_topics = 10, id2word = id2word) # create lsa model over corpus

In [103]:
lsa_model.print_topics() # print lsa topics with key words

[(0,
  '0.466*"movie" + 0.353*"film" + 0.256*"marvel" + 0.237*"character" + 0.189*"well" + 0.188*"good" + 0.170*"scene" + 0.153*"great" + 0.141*"action" + 0.140*"fight"'),
 (1,
  '0.697*"movie" + -0.615*"film" + -0.129*"also" + -0.128*"well" + -0.112*"character" + 0.083*"marvel" + -0.074*"mcu" + 0.064*"good" + 0.061*"watch" + -0.060*"great"'),
 (2,
  '-0.515*"film" + -0.351*"marvel" + 0.216*"character" + -0.202*"movie" + 0.200*"go" + 0.192*"also" + 0.144*"really" + 0.129*"fight" + 0.124*"find" + 0.123*"think"'),
 (3,
  '-0.552*"marvel" + 0.358*"mcu" + 0.305*"great" + 0.199*"scene" + 0.186*"good" + 0.160*"movie" + 0.139*"really" + -0.136*"get" + -0.119*"make" + 0.104*"also"'),
 (4,
  '-0.611*"good" + 0.289*"well" + 0.213*"character" + 0.210*"movie" + -0.205*"marvel" + 0.175*"feel" + -0.152*"fight" + -0.133*"wenwu" + -0.114*"mcu" + -0.108*"shaun"'),
 (5,
  '0.368*"great" + -0.339*"film" + 0.311*"marvel" + 0.259*"really" + -0.243*"movie" + 0.204*"character" + 0.199*"action" + 0.165*"story

In [98]:
# print out topics visioned by lda and lsa
lda_topics ="""
LDA:
    1)Familia Series with weapons and min flaw
    2)Hope in storyline Theme
    3)Expanding Casting
    4)Positve anf useful reunion album
    5)Attack and Young man's death
    6)Fabulous version
    7)Adventures Ending
    8)Feel of film characters
    9)Great marvel movie with story and action
    10)Uneccessary steel breaking"""
lsa_topics = """
LSA:
    1)Marveel Movies are great action movies
    2)Marvel movie is also well charactered
    3)Marevel movies go for real fight
    4)Marvel movies has realy great scenes
    5)Good and well characted movie
    6)Marvel film with action story
    7)Good and new action scenes
    8)Characters with real fighting scenes
    9)Movie is well actioned with great fight scenes
    10)Good scenes with new characters
    """
print(lda_topics)
print(lsa_topics)


LDA:
    1)Familia Series with weapons and min flaw
    2)Hope in storyline Theme
    3)Expanding Casting
    4)Positve anf useful reunion album
    5)Attack and Young man's death
    6)Fabulous version
    7)Adventures Ending
    8)Feel of film characters
    9)Great marvel movie with story and action
    10)Uneccessary steel breaking

LSA:
    1)Marveel Movies are great action movies
    2)Marvel movie is also well charactered
    3)Marevel movies go for real fight
    4)Marvel movies has realy great scenes
    5)Good and well characted movie
    6)Marvel film with action story
    7)Good and new action scenes
    8)Characters with real fighting scenes
    9)Movie is well actioned with great fight scenes
    10)Good scenes with new characters
    


# **Question 2: Sentiment Analysis**

(30 points). Sentiment analysis also known as opinion mining is a sub field within Natural Language Processing (NLP) that builds machine learning algorithms to classify a text according to the sentimental polarities of opinions it contains, e.g., positive, negative, neutral. The purpose of this question is to develop a machine learning classifier for sentiment analysis. Based on the dataset from assignment three, write a python program to implement a sentiment classifier and evaluate its performance. Notice: **80% data for training and 20% data for testing**.  

(1) Features used for sentiment classification and explain why you select these features.

(2) Select two of the supervised learning algorithm from scikit-learn library: https://scikit-learn.org/stable/supervised_learning.html#supervised-learning, to build a sentiment classifier respectively. 

(3) Compare the performance over accuracy, precision, recall, and F1 score for the two algorithms you selected. Here is the reference of how to calculate these metrics: https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9. 

In [23]:
# Write your code here
def review_classification(rating):
    if rating == 5: return 'Very Positive'
    elif rating == 4: return 'Positive'
    elif rating == 3: return 'Neutral'
    elif rating == 2: return 'Negative'
    elif rating == 1: return 'Very Negative'

df = pd.read_csv("Amazon Reviews.CSV")
df.dropna(inplace=True, subset=['Rating', "Reviews"])
df["Sentiment"] = df["Rating"].apply(review_classification)

In [24]:
stopwords_list = stopwords.words('english') # importing stopwords
punctuations_list = string.punctuation # get punctuations
lemmatizer = WordNetLemmatizer() # initialize word lemmatizer
def preprocessing(text):
    """
    This function will clean the givern text
    """
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower())
    text = text + " ".join(emoticons).replace('-', '')
    tokenize_text = [lemmatizer.lemmatize(word.lower()) for word in nltk.tokenize.word_tokenize(text) if (word not in stopwords_list) and (word not in punctuations_list) and (len(word)>=2) and (word.isalnum())]
    return " ".join(tokenize_text)

df["Clean Review"] = df["Reviews"].apply(preprocessing) # Clean all the reviews got from txt file

In [25]:

tf_idf = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
tf_idf.fit(df['Clean Review'])
X = tf_idf.transform(df['Clean Review'])
encoder = LabelEncoder()
y = encoder.fit_transform(df['Rating'])


In [26]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

In [35]:
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train, y_train)

predicted_y = svm_model.predict(X_test)
svm_report = classification_report(y_test, predicted_y, output_dict = True)

print("SVM Accuracy: ", accuracy_score(y_test, predicted_y))
print("Classification Report: ")
print(classification_report(y_test, predicted_y))

SVM Accuracy:  0.7072274397713353
Classification Report: 
              precision    recall  f1-score   support

           0       0.69      0.85      0.76       531
           1       0.46      0.15      0.23       149
           2       0.51      0.25      0.34       193
           3       0.50      0.17      0.25       331
           4       0.75      0.92      0.83      1245

    accuracy                           0.71      2449
   macro avg       0.58      0.47      0.48      2449
weighted avg       0.67      0.71      0.66      2449



In [36]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
predicted_y = tree_model.predict(X_test)
tree_report = classification_report(y_test, predicted_y, output_dict = True)

print("Decision Tree Model Accuracy: ", accuracy_score(y_test, predicted_y))
print("Classification Report: ")
print(classification_report(y_test, predicted_y))

Decision Tree Model Accuracy:  0.7419354838709677
Classification Report: 
              precision    recall  f1-score   support

           0       0.78      0.78      0.78       531
           1       0.51      0.53      0.52       149
           2       0.58      0.53      0.55       193
           3       0.56      0.50      0.53       331
           4       0.82      0.85      0.83      1245

    accuracy                           0.74      2449
   macro avg       0.65      0.64      0.64      2449
weighted avg       0.74      0.74      0.74      2449



In [37]:
print("SVM Accuracy: ", svm["accuracy"], '\tvs\t', "Decision Tree Accuracy: ", tree_report["accuracy"])

{'0': {'precision': 0.7849056603773585,
  'recall': 0.783427495291902,
  'f1-score': 0.7841658812441094,
  'support': 531},
 '1': {'precision': 0.5064102564102564,
  'recall': 0.5302013422818792,
  'f1-score': 0.5180327868852459,
  'support': 149},
 '2': {'precision': 0.5828571428571429,
  'recall': 0.5284974093264249,
  'f1-score': 0.5543478260869565,
  'support': 193},
 '3': {'precision': 0.5622895622895623,
  'recall': 0.5045317220543807,
  'f1-score': 0.5318471337579618,
  'support': 331},
 '4': {'precision': 0.8156467854376452,
  'recall': 0.8457831325301205,
  'f1-score': 0.8304416403785488,
  'support': 1245},
 'accuracy': 0.7419354838709677,
 'macro avg': {'precision': 0.6504218814743931,
  'recall': 0.6384882202969415,
  'f1-score': 0.6437670536705644,
  'support': 2449},
 'weighted avg': {'precision': 0.7375784219782147,
  'recall': 0.7419354838709677,
  'f1-score': 0.7392851540083646,
  'support': 2449}}

In [43]:
print("SVM Accuracy:", round(svm_report["accuracy"], 2), '\tvs\t', 
      "Decision Tree Accuracy:", round(tree_report["accuracy"], 2))
print("SVM Prescision:", round(svm_report["weighted avg"]["precision"], 2), '\tvs\t', 
      "Decision Tree Accuracy:", round(tree_report["weighted avg"]["precision"], 2))
print("SVM Prescision:", round(svm_report["weighted avg"]["recall"], 2), '\tvs\t', 
      "Decision Tree Accuracy:", round(tree_report["weighted avg"]["recall"], 2))
print("SVM Prescision:", round(svm_report["weighted avg"]["f1-score"], 2), '\tvs\t', 
      "Decision Tree Accuracy:", round(tree_report["weighted avg"]["f1-score"], 2))

SVM Accuracy: 0.71 	vs	 Decision Tree Accuracy: 0.74
SVM Prescision: 0.67 	vs	 Decision Tree Accuracy: 0.74
SVM Prescision: 0.71 	vs	 Decision Tree Accuracy: 0.74
SVM Prescision: 0.66 	vs	 Decision Tree Accuracy: 0.74


# **Question 3: House price prediction**

(40 points). You are required to build a **regression** model to predict the house price with 79 explanatory variables describing (almost) every aspect of residential homes. The purpose of this question is to practice regression analysis, an supervised learning model. The training data, testing data, and data description files can be download here: https://github.com/unt-iialab/info5731_spring2021/blob/main/assignment/assignment4-question3-data.zip. Here is an axample for implementation: https://towardsdatascience.com/linear-regression-in-python-predict-the-bay-areas-home-price-5c91c8378878. 


In [14]:
# Write your code here
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [15]:
train_data.fillna(train_data.mean(), inplace = True)
test_data.fillna(test_data.mean(), inplace = True)

  train_data.fillna(train_data.mean(), inplace = True)
  test_data.fillna(test_data.mean(), inplace = True)


In [34]:
from sklearn.preprocessing import LabelEncoder
columns = ('GarageCond', 'LandContour', 'RoofStyle', 'RoofMatl', 'Heating', 'MiscFeature', 'SaleType', 'GarageType', 'Electrical', 
           'SaleCondition', 'Foundation', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'FireplaceQu', 'LotConfig', 'Neighborhood', 
           'Condition1', 'Condition2', 'Utilities', 'BldgType', 'HouseStyle','PoolQC', 'BsmtQual', 'BsmtCond', 'GarageQual',
           'BsmtExposure', 'ExterQual', 'ExterCond','HeatingQC', 'KitchenQual', 'BsmtFinType1','BsmtFinType2', 'Functional',
           'Fence', 'GarageFinish', 'LandSlope','LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass',
           'OverallCond', 'YrSold', 'MoSold', 'MSZoning')

for column in columns:
    encoder = LabelEncoder()
    train_data[column] = encoder.fit(list(train_data[column].values)).transform(list(train_data[column].values))
for column in columns:
    encoder = LabelEncoder()
    test_data[column] = encoder.fit(list(test_data[column].values)).transform(list(test_data[column].values))

In [39]:
X_train = train_data[train_data.columns[:80]]
y_train = train_data['SalePrice']
X_test = test_data[test_data.columns[:80]]

In [40]:

rg_model = LinearRegression()
rg_model.fit(X_train, y_train)

LinearRegression()

In [42]:
y_perdicted = rg_model.predict(X_test)
fill_data = pd.read_csv("test.csv")
fill_data["Perdicted Price"] = y_perdicted
fill_data.to_csv("Perdicted Prices.CSV", index=False)