<a href="https://colab.research.google.com/github/jillianhaig/Project1_DS4002/blob/main/SCRIPTS/1DatasetCreationAndCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This file creates our initial dataset by joining all of the individual movie datasets.
# It also creates the UniqueID variable that identifies which movie each review belongs to.
# It optionally downloads the data as DisneyMoviesDataset.csv.
# The code below also assigns each movie a release date, flags whether each review counts as a recent release or not,
# and conducts sentiment analysis using the VADER package.

In [None]:
# Clone the shared project repo only when no checkout exists yet, so re-running this cell does not fail.
! [ -d Project1_DS4002 ] || git clone https://github.com/jillianhaig/Project1_DS4002 # so we can access data loaded from shared github


Cloning into 'Project1_DS4002'...
remote: Enumerating objects: 553, done.[K
remote: Counting objects: 100% (164/164), done.[K
remote: Compressing objects: 100% (152/152), done.[K
remote: Total 553 (delta 102), reused 12 (delta 12), pack-reused 389 (from 1)[K
Receiving objects: 100% (553/553), 14.46 MiB | 6.65 MiB/s, done.
Resolving deltas: 100% (244/244), done.


In [None]:
import pandas as pd
import os
from datetime import datetime, timedelta

In [None]:
folder_path = '/content/Project1_DS4002/Data/Individual Movie Data'

# Gather every CSV file in the folder. os.listdir() returns names in arbitrary
# order, so sort them to keep the merged row order identical from run to run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# List that collects one DataFrame per file; concatenated once after the loop
dataframes = []

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    # Tag every row with the file name (extension removed) so its source file stays identifiable
    df['UniqueID'] = os.path.splitext(file)[0]
    dataframes.append(df)

# Stack all per-file frames into one dataset with a fresh 0..n-1 index
merged_df = pd.concat(dataframes, ignore_index=True)

# Saved relative to the current working directory
merged_df.to_csv('DisneyMoviesDataset.csv', index=False)

# Optional code to download file
# from google.colab import files
# files.download('DisneyMoviesDataset.csv')

In [None]:
# Load the merged dataset from the cloned repo's "Final Datasets" folder.
# Note: this reads the repo copy, not the DisneyMoviesDataset.csv that the
# merge cell writes to the current working directory.
disney_df = pd.read_csv("/content/Project1_DS4002/Data/Final Datasets/DisneyMoviesDataset.csv")

In [None]:
# Define a dictionary mapping movie titles to their release dates
# Keys are movie titles as they appear inside UniqueID; values are US theatrical
# release dates in ISO "YYYY-MM-DD" form.
# 'Cars 2' is listed before 'Cars' because 'Cars' is a substring of 'Cars 2'.
movie_release_dates = {
    'Big Hero 6': '2014-11-07',
    'Monsters University': '2013-06-21',
    'Lilo & Stitch': '2002-06-21',
    'Finding Dory': '2016-06-17',
    'Bolt': '2008-11-21',
    'WALL·E': '2008-06-27',
    # Corrected from 2016-03-17, which is not the US release date used for every other title.
    'Zootopia': '2016-03-04',
    'Cars 2': '2011-06-24',
    'Finding Nemo': '2003-05-30',
    'Monsters, Inc.': '2001-11-02',
    'Wreck-It Ralph': '2012-11-02',
    'Inside Out': '2015-06-19',
    'Brave': '2012-06-22',
    'The Incredibles': '2004-11-05',
    'Moana': '2016-11-23',
    'The Emperor\'s New Groove': '2000-12-15',
    'Tangled': '2010-11-24',
    'Toy Story 3': '2010-06-18',
    'Up': '2009-05-29',
    'Frozen': '2013-11-27',
    'Ratatouille': '2007-06-29',
    'Cars': '2006-06-09',
    'Toy Story 4': '2019-06-21'
}

# Map UniqueID to release_date
def get_release_date(unique_id, release_dates=None):
    """Look up the release date of the movie named inside a UniqueID.

    A title matches when it occurs as a substring of ``unique_id``. When several
    titles match (e.g. 'Cars' and 'Cars 2' both occur in a 'Cars 2' ID), the
    longest title wins, so the result does not depend on dictionary order.

    Args:
        unique_id: Identifier text containing a movie title.
        release_dates: Optional mapping of title -> release date string.
            Defaults to the module-level ``movie_release_dates``.

    Returns:
        The release date string of the best-matching title, or None when
        ``unique_id`` is not a string (e.g. NaN) or contains no known title.
    """
    if release_dates is None:
        release_dates = movie_release_dates
    # Missing IDs arrive as NaN floats; `in` on a float would raise TypeError.
    if not isinstance(unique_id, str):
        return None
    matches = [title for title in release_dates if title in unique_id]
    if not matches:
        return None
    # Most specific title; for equal lengths max() keeps the first one listed.
    return release_dates[max(matches, key=len)]

# ***NOTE*** We had to manually set the release date for 'Cars' within the dataset, because the title 'Cars' is also contained in 'Cars 2'.
# Simply use the FinalDisneyDataset.csv under the Final Datasets folder when conducting your analysis.

# Create the new column 'release_date' by looking up each row's UniqueID;
# rows whose UniqueID contains no known title get None
disney_df['release_date'] = disney_df['UniqueID'].apply(get_release_date)

In [None]:
def convert_date(date_str):
    """Convert a "day month-name year" date string to ISO "YYYY-MM-DD".

    Args:
        date_str: Date text such as "7 November 2014".

    Returns:
        The date formatted as "YYYY-MM-DD" when ``date_str`` matches the
        "%d %B %Y" pattern; otherwise ``date_str`` is returned unchanged
        (text in another format, or a non-string value such as NaN/None).
    """
    try:
        # Convert the original date string to a datetime object
        date_object = datetime.strptime(date_str, "%d %B %Y")
        return date_object.strftime("%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: text does not match the pattern; TypeError: not a string.
        # Catching only these lets KeyboardInterrupt/SystemExit propagate.
        return date_str

# Rewrite review dates as ISO strings wherever the text format is recognized
disney_df['date'] = disney_df['date'].apply(convert_date)

# Parse both date columns into pandas datetime values
for date_column in ('date', 'release_date'):
    disney_df[date_column] = pd.to_datetime(disney_df[date_column])

# Define the function to determine if 'date' is within one year of 'release_date'
def is_recent(row):
    """Flag whether a review falls within 365 days of the movie's release.

    Args:
        row: Mapping-like record providing 'release_date' and 'date'.

    Returns:
        1 when 'date' lies in [release_date - 365 days, release_date + 365 days]
        (both ends inclusive), otherwise 0.
    """
    release = row['release_date']
    window = timedelta(days=365)
    earliest = release - window
    latest = release + window
    if earliest <= row['date'] <= latest:
        return 1
    return 0

# Apply is_recent to disney_df row by row (axis=1) and store the 1/0 flag in 'recent?'
disney_df['recent?'] = disney_df.apply(is_recent, axis=1)

# Optional code to download dataset
#disney_df.to_csv('CleanedDisneyMoviesDataset.csv', index=False)
#from google.colab import files
#files.download('CleanedDisneyMoviesDataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Initialize the VADER sentiment analyzer once at module level;
# get_sentiment_scores reads this shared instance for every review
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Function to get sentiment scores
def get_sentiment_scores(review):
    """Score one review with the module-level VADER ``analyzer``.

    Args:
        review: Review text passed to ``analyzer.polarity_scores``.

    Returns:
        A pandas Series holding the 'neg', 'pos' and 'compound' scores,
        in that order.
    """
    polarity = analyzer.polarity_scores(review)
    wanted_keys = ('neg', 'pos', 'compound')
    return pd.Series([polarity[key] for key in wanted_keys])

In [None]:
# Apply the function to the 'review' column. Each call returns a 3-element Series
# (neg, pos, compound), which is assigned to the three new columns in that order.
disney_df[['negative', 'positive', 'compound']] = disney_df['review'].apply(get_sentiment_scores)

In [None]:
# Remove the 'username' column. errors="ignore" keeps this cell safe to re-run:
# without it, a second run raises KeyError because the column is already gone.
disney_df = disney_df.drop(columns=["username"], errors="ignore")

In [None]:
# Optional Code to Download the dataset

#disney_df.to_csv('FinalDisneyDataset.csv', index=False)
#from google.colab import files
#files.download('FinalDisneyDataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>