**Installing necessary libraries**

In [1]:
!pip install pandas nltk textblob matplotlib



**Importing Libraries**

In [None]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from transformers import pipeline

**Loading Dataset**

In [2]:
# Read the reviews CSV into a DataFrame and preview its first rows.
reviews_path = 'Review.csv'
df = pd.read_csv(reviews_path)
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,1178162,4724140,5/21/13,4298113,Olivier,My stay at islam's place was really cool! Good...
1,1178162,4869189,5/29/13,6452964,Charlotte,Great location for both airport and city - gre...
2,1178162,5003196,6/6/13,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...
3,1178162,5150351,6/15/13,2215611,Marine,The room was nice and clean and so were the co...
4,1178162,5171140,6/16/13,6848427,Andrew,Great location. Just 5 mins walk from the Airp...


**Downloading NLTK resources**

In [None]:
# Corpora / models used by the preprocessing functions in this notebook:
#   stopwords                  -> stopwords.words("english")
#   averaged_perceptron_tagger -> nltk.pos_tag (resource name used by older NLTK releases)
#   wordnet                    -> WordNetLemmatizer
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# NLTK >= 3.9 looks the tagger up as 'averaged_perceptron_tagger_eng'; without it
# nltk.pos_tag raises LookupError on those releases. Fetching both names keeps the
# notebook runnable on either side of that change.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

**User-defined functions for text preprocessing**

In [None]:
# Function to change case to lower
def changecase(text):
    """Return ``text`` converted to lower case.

    Args:
        text: The string to convert.

    Returns:
        A lower-cased copy of ``text``.
    """
    # ``str`` has no ``changecase`` method (calling it raises AttributeError);
    # ``lower`` is the built-in that performs the conversion described above.
    return text.lower()

# Function to remove non-ASCII characters
def remove_nascii(data_str):
    """Strip every character whose code point is outside the range 1..126.

    Args:
        data_str: The string to filter.

    Returns:
        ``data_str`` with NUL (0), DEL (127) and all characters above 127
        removed; the remaining characters keep their original order.
    """
    kept_chars = []
    for ch in data_str:
        code_point = ord(ch)
        if code_point > 0 and code_point < 127:
            kept_chars.append(ch)
    return ''.join(kept_chars)

# Function to fix common abbreviations
def abbreviations_fix(data_str):
    """Lower-case ``data_str`` and expand common chat-style abbreviations.

    Each pattern is matched on word boundaries and applied in the order
    listed, so the output of one substitution is visible to the next.

    Args:
        data_str: The text to normalise.

    Returns:
        The lower-cased text with abbreviations expanded and leading/trailing
        whitespace stripped. Note that the replacement for ``k`` is the
        upper-case string ``'OK'``.
    """
    # ``str`` has no ``changecase`` method (it raises AttributeError); the
    # patterns below are all lower-case, so the text is lower-cased first.
    data_str = data_str.lower()
    abbreviations = {
        r'\bthats\b': 'that is',
        r'\bive\b': 'i have',
        r'\bim\b': 'i am',
        r'\bya\b': 'yeah',
        r'\bcant\b': 'can not',
        r'\bdont\b': 'do not',
        r'\bwont\b': 'will not',
        r'\bid\b': 'i would',
        r'\bwtf\b': 'what the fuck',
        r'\bwth\b': 'what the hell',
        r'\br\b': 'are',
        r'\bu\b': 'you',
        r'\bk\b': 'OK',
        r'\bsux\b': 'sucks',
        r'\bno+\b': 'no',      # collapses elongated "nooo" to "no"
        r'\bcoo+\b': 'cool',   # collapses "coo", "cooo", ... to "cool"
        r'\brt\b': '',         # drops the standalone token "rt"
    }
    for abbr, replacement in abbreviations.items():
        data_str = re.sub(abbr, replacement, data_str)
    return data_str.strip()

# Function to clean the text (remove URLs, punctuation, numbers)
# The patterns are compiled once here instead of on every call to ``clean``.
# Raw strings keep ``\w`` / ``\d`` from being treated as (invalid) string escapes.
_URL_RE = re.compile(r'https?://(www\.)?\w+\.\w+(/\w+)*/?')
_PUNC_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NUM_RE = re.compile(r'(\d+)')
_MENTION_RE = re.compile(r'@(\w+)')
_ALPHA_NUM_RE = re.compile(r"^[a-z0-9_.]+$")


def clean(data_str):
    """Lower-case ``data_str`` and strip URLs, @mentions, punctuation and digits.

    The substitutions run in that order, each replacing its match with a
    single space. The text is then split on whitespace and only tokens made
    up entirely of ``a-z``, ``0-9``, ``_`` or ``.`` are kept, which drops any
    token still containing other characters (for example accented letters).

    Args:
        data_str: The text to clean.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    # ``str`` has no ``changecase`` method (it raises AttributeError).
    data_str = data_str.lower()
    # URLs and mentions go first: they rely on punctuation (':', '/', '@')
    # that the punctuation pass would otherwise destroy.
    data_str = _URL_RE.sub(' ', data_str)
    data_str = _MENTION_RE.sub(' ', data_str)
    data_str = _PUNC_RE.sub(' ', data_str)
    data_str = _NUM_RE.sub(' ', data_str)
    return " ".join(w for w in data_str.split() if _ALPHA_NUM_RE.match(w))

# Function to remove stopwords
def remove_stop_words(data_str):
    """Drop English stopwords from a whitespace-separated string.

    Args:
        data_str: The text to filter; it is split on whitespace.

    Returns:
        The tokens that are not in NLTK's English stopword list, joined by
        single spaces. Matching is exact, so it is case-sensitive.
    """
    english_stops = set(stopwords.words("english"))
    kept_tokens = []
    for token in data_str.split():
        if token in english_stops:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Function for part-of-speech tagging
def pos_tag(data_str):
    """Keep only the tokens tagged as nouns, adjectives or verbs.

    The text is split on whitespace, tagged with ``nltk.pos_tag``, and every
    token whose tag is not one of the listed noun (NN*), adjective (JJ*) or
    verb (VB*) tags is discarded.

    Args:
        data_str: The text to filter.

    Returns:
        The retained tokens, in their original order, joined by single spaces.
    """
    wanted_tags = frozenset((
        'NN', 'NNP', 'NNPS', 'NNS',                 # nouns
        'JJ', 'JJR', 'JJS',                         # adjectives
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',    # verbs
    ))
    kept_tokens = []
    for token, tag in nltk.pos_tag(data_str.split()):
        if tag in wanted_tags:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

# Function for lemmatization
def lemmatize(data_str):
    """Lemmatize every whitespace-separated token with WordNet.

    Each token is POS-tagged first; tokens whose tag starts with ``'V'`` are
    lemmatized as verbs (``'v'``) and all other tokens as nouns (``'n'``).

    Args:
        data_str: The text to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tokens_with_tags = nltk.pos_tag(data_str.split())
    return " ".join(
        lemmatizer.lemmatize(token, 'v' if tag.startswith('V') else 'n')
        for token, tag in tokens_with_tags
    )

# Function to compute sentiment polarity using TextBlob
def sentiment_score(comments):
    """Return the TextBlob sentiment polarity of ``comments``.

    Args:
        comments: The text to score.

    Returns:
        The ``polarity`` attribute of ``TextBlob(comments).sentiment``.
    """
    blob = TextBlob(comments)
    sentiment = blob.sentiment
    return sentiment.polarity

**Dropping rows with null values in required columns**

In [None]:
# Discard any review that is missing a value in one of these columns.
required_columns = [
    'listing_id',
    'id',
    'date',
    'reviewer_id',
    'reviewer_name',
    'comments',
]
df = df.dropna(subset=required_columns)

**Applying preprocessing functions to 'comments' column**

In [None]:
# Cleaning stages, applied in this order; each stage receives the previous
# stage's output and overwrites the 'comments' column.
preprocessing_steps = (
    changecase,
    remove_nascii,
    abbreviations_fix,
    clean,
    remove_stop_words,
    pos_tag,
    lemmatize,
)
for step in preprocessing_steps:
    df['comments'] = df['comments'].apply(step)

# Polarity is computed on the fully preprocessed text.
df['sentiment_score'] = df['comments'].apply(sentiment_score)
df.head()

**Plotting sentiment score over time**

In [None]:
# Scatter the sentiment score of the first `n_points` reviews against their date.
n_points = 50
plot_sample = df.iloc[:n_points]

# 'date' is loaded from the CSV as text (e.g. '5/21/13'). Plotting the raw
# strings gives a categorical axis in row order with uniform spacing, which is
# not a timeline; parsing to datetimes places each point at its real date.
plot_dates = pd.to_datetime(plot_sample['date'])

fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(plot_dates, plot_sample['sentiment_score'])
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment Score')
ax.set_title('Sentiment Score Over Time')
ax.tick_params(axis='x', rotation=90)
ax.grid(True)
plt.show()

**Generate the word cloud**

In [None]:
# Merge every preprocessed comment into a single space-separated string
all_comments = ' '.join(df['comments'])

# Configure the word cloud, then generate it from the merged text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_comments)

# Render the word cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Comments')
plt.show()

**Load the sentiment analysis pipeline**

In [None]:
# `pipeline` is already imported in the imports cell at the top of the notebook.

# Load the sentiment analysis pipeline (no model is named, so the library's
# default model for this task is used)
sentiment_pipeline = pipeline('sentiment-analysis')

# Analyze the sentiment of the first `n_comments` comments
n_comments = 50
for comment in df['comments'][:n_comments]:
    # truncation=True cuts inputs down to the model's maximum sequence length;
    # without it an over-long review makes the pipeline raise instead of scoring.
    result = sentiment_pipeline(comment, truncation=True)
    print(comment)
    print(result)