# Data Visualization preview
You can see the full project on Behance: https://www.behance.net/gallery/141471583/Tinder-data-Visualization

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from tqdm import tqdm
from collections import Counter

import torch

In [None]:
# Load only the review columns used in this notebook and give them short names.
# Keys are the CSV column names, values the names used by the following cells
# (the 'timestap' / 'replytimestap' spellings are referenced later as written).
column_names = {
    'content': 'review',
    'score': 'score',
    'thumbsUpCount': 'up',
    'reviewCreatedVersion': 'version',
    'at': 'timestap',
    'repliedAt': 'replytimestap',
}
df = pd.read_csv('tinder_google_play_reviews.csv', encoding='utf8', usecols=list(column_names))
df = df.rename(columns=column_names)
df

In [None]:
# Keep only the rows that have review text. The explicit .copy() makes `df` an
# independent frame, so the column assignments in the following cells do not
# trigger SettingWithCopyWarning on a filtered slice.
df = df[df['review'].notna()].copy()

# Remaining missing values per column (shown as the cell output).
df.isnull().sum()

# Convert to datetime

In [None]:
# Parse the review timestamps as timezone-aware (UTC) datetimes.
df['timestap'] = pd.to_datetime(df['timestap'], utc=True)

# Calendar date and year of every review. The .dt accessor returns the values
# themselves; the previous `apply(lambda r: r.date)` stored the bound `date`
# method of each timestamp instead of calling it.
df['date'] = df['timestap'].dt.date
df['year'] = df['timestap'].dt.year

# Drop the reviews written in 2022; .copy() keeps `df` an independent frame for
# the column assignments done in later cells.
df = df[df['year'] != 2022].copy()
df

# Plot the number of 5-star and 1-star reviews across years

In [None]:
# Number of reviews for every (score, year) pair, newest year first.
score_year_counts = df.groupby(['score', 'year'], dropna=True, as_index=False).size()
dfr = score_year_counts.sort_values('year', ascending=False)

# Discard the 0, 2, 3 and 4 scores so only the remaining ones are plotted.
dfr = dfr[~dfr['score'].isin([0, 2, 3, 4])]

# One line per score; each point is labelled with its score.
fig = px.line(dfr, x='year', y='size', color='score', text='score')
fig.update_traces(textposition='top center')
fig.show()

In [None]:
# Remove the reviews that carry no app version.

# Missing versions become the placeholder '0'. The filled column is assigned
# back explicitly: `df['version'].fillna(..., inplace=True)` is a chained
# in-place call that does not reliably update `df` (and warns in recent pandas).
df['version'] = df['version'].fillna('0')

# Major version = the text before the first dot of the version string.
df['versionshort'] = df['version'].str.split('.').str[0]

# Drop the placeholder rows; .copy() keeps `df` an independent frame.
df = df[df['versionshort'] != '0'].copy()
df['versionshort'].value_counts()

# Plot app version vs review score 

In [None]:
# Number of reviews for every (score, major version) pair.
dfs = df.groupby(['score', 'versionshort'], dropna=True, as_index=False).size()

# 'versionshort' holds strings, so the requested category order is expressed
# as strings too ('1' ... '13'); integer entries do not match the column values.
version_order = [str(major) for major in range(1, 14)]

# Bubble chart: x = major version, y = score, bubble size/text = review count.
fig = px.scatter(
    dfs, x='versionshort', y='score', size='size', color='score',
    title='App version vs Score review', size_max=30, opacity=0.7,
    category_orders={'versionshort': version_order}, labels={'score': 'Count'},
    text='size',
)
fig.update_xaxes(title='Version')
fig.update_yaxes(title='Score')
fig.update_layout(legend_title_text='Size', uniformtext_minsize=3, uniformtext_mode='hide')
fig.update_traces(textposition='top center', textfont_size=10, textfont_color='grey')
fig.show()

In [None]:
# Renumber the rows 0..n-1 after the filtering above. drop=True discards the old
# index instead of keeping it as an extra numeric 'index' column, which would
# otherwise be averaged along with the real data in the groupby-mean cells.
df = df.reset_index(drop=True)

# Sentiment analysis (TextBlob, VADER, BERT)

In [None]:
# Register `progress_apply` on pandas objects so long runs show a progress bar.
tqdm.pandas()

# TextBlob polarity of every review.
blob_polarity = df['review'].progress_apply(lambda text: TextBlob(text).sentiment.polarity)
df['sentimentBlob'] = blob_polarity

In [None]:
# VADER 'compound' score of every review.
analyser = SentimentIntensityAnalyzer()
vader_compound = df['review'].progress_apply(
    lambda text: analyser.polarity_scores(text)['compound']
)
df['sentimentv'] = vader_compound

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# The pipeline takes an integer device index: 0 = first GPU, -1 = CPU.
# Previously device=0 was hard-coded, which fails on machines without a GPU.
pipeline_device = 0 if device.startswith("cuda") else -1
sentiment_analysis = pipeline("sentiment-analysis", device=pipeline_device)

In [None]:
# The standard BERT pipeline fails on texts that are too long, so every call is
# wrapped in try/except (a long-input model would be the alternative).
# Scores are collected in a list and assigned once: this gives a numeric column
# and avoids the chained assignment `df['sentimentT'][i] = ...`.
bert_scores = []

for review_text in tqdm(df['review']):
    try:
        result = sentiment_analysis(review_text)[0]
    except Exception:
        # Best effort: reviews the model cannot process get a near-neutral score.
        bert_scores.append(0.01)
        continue

    # Signed score: negative predictions become negative numbers.
    if result['label'] == 'NEGATIVE':
        bert_scores.append(-result['score'])
    else:
        bert_scores.append(result['score'])

df['sentimentT'] = bert_scores

In [None]:
# Round the three sentiment scores to two decimals.
df['sentimentBlob'] = df['sentimentBlob'].round(2)
df['sentimentv'] = df['sentimentv'].round(2)
# 'sentimentT' may be stored with object dtype (it is filled value by value), in
# which case rounding fails or does nothing; convert it to numbers first.
df['sentimentT'] = pd.to_numeric(df['sentimentT']).round(2)

# Compare the results with the review scores

In [None]:
# Correlation between the review score and the three sentiment scores,
# rounded to two decimals and drawn as an annotated heatmap.
compared_columns = ['score', 'sentimentBlob', 'sentimentv', 'sentimentT']
axis_names = ['Score', 'TextBlob', 'Vader', 'BERT']
correlations = df[compared_columns].corr().round(2)

fig = px.imshow(
    correlations,
    text_auto=True,
    labels=dict(color='Correlation'),
    x=axis_names,
    y=axis_names,
    title='Correlation between sentiment analysis libraries vs score reviews',
)
fig.show()

# BERT gets the highest correlation value

In [None]:
# Mean BERT sentiment per date. Only 'sentimentT' is averaged: taking the mean
# of the whole frame also hits the text columns, which raises in pandas >= 2.0.
dfs = df.groupby('date', dropna=True, as_index=False)[['sentimentT']].mean()

# Long-window trend (500 rows, at least 50) and short-window activity (4 rows).
dfs['mean'] = dfs['sentimentT'].rolling(500, min_periods=50).mean()
dfs['mean2'] = dfs['sentimentT'].rolling(4, min_periods=1).mean()

fig = go.Figure()

# Shaded bands: blue for positive sentiment, red for negative sentiment.
fig.add_hrect(y0=0, y1=1, fillcolor='blue', opacity=0.2)
fig.add_hrect(y0=0, y1=-1, fillcolor='red', opacity=0.2)
fig.add_trace(go.Scatter(x=dfs['date'], y=dfs['mean2'], name='Sentiment activity', line_color='gray'))
# go.Scatter replaces the deprecated go.Line class; the trace type is unchanged.
fig.add_trace(go.Scatter(x=dfs['date'], y=dfs['mean'], name='Rolling mean', line_color='yellow', line_width=4))
fig.add_hline(y=0, line_width=2, line_dash='dash', line_color='white')

fig.update_layout(title='Sentiment activity Tinder app reviews in store 2013-2021, BERT')
fig.show()

In [None]:
# Number of reviews per year, drawn as bubbles on a single horizontal line
# (constant y = 0) whose size is the review count.
dfy = df.groupby('year', as_index=False).size().assign(y=0)

fig = px.scatter(dfy, x='year', y='y', size='size', size_max=100)
fig.show()

In [None]:
# Mean BERT sentiment per major app version. Only 'sentimentT' is averaged:
# the mean of the whole frame also hits the text columns (error in pandas >= 2.0).
dfsc = df.groupby('versionshort', dropna=True, as_index=False)[['sentimentT']].mean()

# 'versionshort' holds strings, so the groupby order is lexicographic
# ('1', '10', '11', '2', ...). Reorder numerically so the line runs from the
# oldest to the newest version; non-numeric labels, if any, go last.
numeric_order = pd.to_numeric(dfsc['versionshort'], errors='coerce').sort_values().index
dfsc = dfsc.loc[numeric_order]

fig = go.Figure()

fig.add_trace(go.Scatter(x=dfsc['versionshort'], y=dfsc['sentimentT'], mode='lines+markers', line_color='gray', line_width=4))
# Shaded bands: blue for positive sentiment, red for negative sentiment.
fig.add_hrect(y0=0, y1=1, fillcolor='blue', opacity=0.2)
fig.add_hrect(y0=0, y1=-1, fillcolor='red', opacity=0.2)
fig.add_hline(y=0, line_width=2, line_dash='dash', line_color='white')

# Treat versions as categories so they are drawn evenly spaced, in data order.
fig.update_xaxes(type='category')
fig.update_yaxes(range=[-0.5, 0.5])
fig.update_layout(showlegend=False, title='Sentiment by app version')
fig.show()

# N-gram analysis

In [None]:
# Normalize the review text before counting words.
df['review'] = df['review'].str.lower()
# Remove digits. regex=True is required: since pandas 2.0 `str.replace` treats
# the pattern as a literal string by default, so '\d+' would match nothing.
df['review'] = df['review'].str.replace(r'\d+', '', regex=True)
# Decompose accented characters and drop everything that is not ASCII.
df['review'] = df['review'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
# Replace every non-word character (punctuation, symbols) with a space.
df['review'] = df['review'].str.replace(r'\W', ' ', regex=True)

In [None]:
# English stop words plus a few words specific to these reviews.
stops = stopwords.words('english')
stops = stops + ['it', 'tinder', 'i', 'app', 'a', 'to', 'with']

# Membership is tested once per word of every review, so look words up in a
# set instead of scanning the list each time.
stop_set = set(stops)

df['reviewstop'] = df['review'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_set)
)

In [None]:
# Stem every word of the stop-word-free reviews with the English Snowball stemmer.
stemmer = SnowballStemmer('english')
tqdm.pandas()

# Split each review into words, stem each word, and join the stems back into
# a single space-separated string.
df['reviewstem'] = df['reviewstop'].progress_apply(
    lambda text: ' '.join(stemmer.stem(word) for word in text.split())
)

In [None]:
# Frequency of every stemmed word across all reviews, most common first.
most = Counter(" ".join(df['reviewstem']).split()).most_common()

# most_common() yields (word, count) tuples, so the columns are named in that
# order. A list is used because a set has no defined order (and newer pandas
# rejects a set for `columns`).
dfm = pd.DataFrame(most, columns=['word', 'count'])

# Bar chart of the 30 most frequent words.
fig = px.bar(dfm.head(30), x='count', y='word')
fig.show()

In [None]:
# Tokenize every stemmed review into a list of words.
df['reviewtoken'] = df['reviewstem'].apply(lambda text: nltk.word_tokenize(text))

In [None]:
# Five most frequent n-grams for n = 1 .. ngram, collected in `wordsf`
# (columns: 'word' = tuple of tokens, 'count', 'ngram' = n).
ngram = 4

# All tokens of all reviews as one flat list.
# NOTE(review): n-grams built from this list can span two consecutive reviews.
listwords = [token for tokens in df['reviewtoken'] for token in tokens]

ngram_frames = []
for size in range(1, ngram + 1):
    # Count once per size and keep the five most frequent n-grams.
    top_counts = pd.Series(nltk.ngrams(listwords, size)).value_counts().head(5)
    dfw = pd.DataFrame({
        'word': list(top_counts.index),
        'count': top_counts.to_numpy(),
        'ngram': size,
    })
    ngram_frames.append(dfw)
    title = '\n Ngram: ' + str(size) + '\n'
    print(title, dfw)

# DataFrame.append was removed in pandas 2.0; concatenate the pieces once.
if ngram_frames:
    wordsf = pd.concat(ngram_frames)
else:
    wordsf = pd.DataFrame(columns=['word', 'count', 'ngram'])

In [None]:
# For each frequent n-gram, look at the reviews where it appears with six words
# of context on both sides and print the most common longer n-grams there.
grams = wordsf['word'].str.join(' ').to_list()
levels = wordsf['ngram'].to_list()
# Size of the n-grams counted inside the context: two tokens longer than the
# phrase itself. Derived from `levels` so it always lines up with `grams`
# (the previous hard-coded list assumed exactly five phrases per level).
ngrams = [level + 2 for level in levels]

for phrase, context_size in zip(grams, ngrams):
    print(phrase)
    # Capture the phrase together with six words before and six words after it.
    pattern = fr'(\w+ \w+ \w+ \w+ \w+ \w+ {phrase} \w+ \w+ \w+ \w+ \w+ \w+)'
    dft = df.reviewstem.str.extract(pattern, expand=True).dropna()
    # Tokenize the captured windows column-wise; this also works when no
    # review matched and `dft` is empty.
    tokens = [token for window in dft[0] for token in nltk.word_tokenize(window)]
    print(pd.Series(nltk.ngrams(tokens, context_size)).value_counts().head(5))
