<a href="https://colab.research.google.com/github/kyrajeep/DL_Projects/blob/master/netflix_common_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Based on a Kaggle dataset Netflix Reviews and a notebook by sara_metawea
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'netflix-reviews-playstore-daily-updated:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4981370%2F8534746%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240528%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240528T203638Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1326f469e96f4747b4229faf15bb28b37611c33ff8e520c9d171892fbef5b7cb4e7a9c4b82facadfd70dfe9d09e332bae17c219d98ce054711937de7b860ce20d7a480b5639d61d87e845358c58c96a88cecb839e26c12c7d10c8d31184370c5fc6e440c91770108ae92245850e25d4dd65fd9f20eca5f8e1fff4daa353f7eb642bf87e0f90aa4bd74943f788283209e5e25de56c68d528e3c767d7e43d3ab80bb9b371a574591b56c26fdf664cb26bc405b37758b0b5e6d2e0409981a45d123535d971ab2f9f17cbd5fd61ab5a11db2e7f8f342a97221be53dd363923f313b5c2c2a432c7c26a963e6994010a6df5974858c736b84368c2aadcac81c306687c'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded download URL>". Split on the first
    # colon only, so a literal ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be absent or zero; in that case the
            # bar stays empty and only the running byte count is shown.
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            # Stream the response to the temp file in CHUNK_SIZE pieces until
            # read() returns an empty bytes object.
            for chunk in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(chunk)
                tfile.write(chunk)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # tarfile.open() below reopens the file by name, so buffered bytes
            # must reach the disk first or the archive may be read truncated.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind to a name other than `tarfile` so the module is not shadowed.
                # NOTE(review): extractall() trusts the member paths stored in the
                # archive; only use this with sources you trust.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from collections import Counter
import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily produce the full path of every file found beneath the input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
# Print one path per line.
for input_file_path in input_file_paths:
    print(input_file_path)



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Netflix reviews CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/netflix-reviews-playstore-daily-updated/netflix_reviews.csv")

In [None]:
# Preview the first two rows of the DataFrame.
data.head(2)

In [None]:
# Ten colors from seaborn's "Reds" palette; the plots in this notebook index into it.
red_palette = sns.color_palette("Reds", 10)

In [None]:
# Histogram of the review scores (five bins) with a KDE curve overlaid,
# drawn in the last color of the red palette.
plt.figure(figsize=(10, 6))
sns.histplot(data['score'], bins=5, kde=True, color=red_palette[-1])
# Title spelling corrected (was 'Distrbribution').
plt.title('Distribution of Review score', fontsize=16)
plt.xlabel('score', fontsize=14)
plt.ylabel('frequency', fontsize=14)
plt.show()

In [None]:
# Box plot of the thumbs-up counts, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['thumbsUpCount'], color=red_palette[0], ax=ax)
ax.set_title('Boxplot of Thumbs Up Counts', fontsize=16)
ax.set_xlabel('Thumbs Up Count', fontsize=16)
plt.show()

In [None]:
# Kernel density estimate of the review scores.
plt.figure(figsize=(10, 6))
# `fill=True` shades the area under the curve; it replaces the deprecated
# `shade=True` keyword.
sns.kdeplot(data['score'], fill=True, color=red_palette[2])
plt.title('Kernel Density Estimation of Scores', fontsize=16)
plt.xlabel('Score', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.show()

In [None]:
# Violin plot of the review scores, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x=data['score'], color=red_palette[4], ax=ax)
ax.set_title('Violin Plot of Review Scores', fontsize=16)
ax.set_xlabel('Scores', fontsize=14)
plt.show()

In [None]:
# Number of reviews per score value, ordered by score.
score_counts = data['score'].value_counts().sort_index()

# Pie chart of each score's share, labelled with the score and its percentage.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    score_counts,
    labels=score_counts.index,
    autopct='%1.2f%%',
    colors=red_palette,
)
ax.set_title('Distribution of Review Scores', fontsize=16)
plt.show()

Has anyone written reviews over a period of time? If multiple users wrote reviews both before and after a significant update or business decision, comparing those reviews can help inform future decisions.

In [None]:
# Count the distinct values in each column.
data.nunique()

There are users that left more than one review.

In [None]:
# Summary statistics of the DataFrame, using pandas' `describe` defaults.
data.describe()

In [None]:
# Replace missing review text with an empty string and coerce every value to
# str, so that string methods can be called safely on each entry of 'content'.
# NOTE(review): duplicate rows are kept; a drop_duplicates() call was
# previously commented out here — confirm that keeping them is intended.
data['content'] = data['content'].fillna('').astype(str)


In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Split a review into lowercase words with ASCII punctuation removed.

    Args:
        text: The review text. Non-string values (for example ``None`` or a
            NaN left by a missing review) are treated as containing no words.

    Returns:
        A list of whitespace-separated words, lowercased, with every character
        in ``string.punctuation`` removed. Empty for empty or non-string input.
    """
    if not isinstance(text, str):
        return []
    # Lowercase first, delete punctuation, then split on any whitespace run.
    return text.lower().translate(_PUNCTUATION_TABLE).split()

In [None]:
# Apply preprocessing to each review
# Tokenise every review and pool the tokens into one flat list.
all_words = [
    token
    for review_text in data['content']
    for token in preprocess(review_text)
]

# Tally how often each token occurs across all reviews.
word_counts = Counter(all_words)

# Every (word, count) pair, ordered from most to least frequent.
most_common_words = word_counts.most_common()

# Show the complete ranking.
print(most_common_words)

In [None]:
# Now decide on what to do with this information.