In [44]:
import pandas as pd
import re
from urllib.parse import urlparse

df_user_reviews = pd.read_csv('UserReviews.txt', delimiter='\t', encoding='ISO-8859-1')
df_meta = pd.read_csv('metaClean43Brightspace.txt', delimiter='\t', encoding='ISO-8859-1')
df_MovieID = pd.read_csv('NewMovieTable.csv', encoding='ISO-8859-1')

df_reviews_only = df_user_reviews[df_user_reviews['Rev'].notna() & (df_user_reviews['Rev'].str.strip() != '')]
df_reviews_only = df_reviews_only.drop_duplicates()

columns_to_drop = [
    "WC", "Analytic", "Clout", "Authentic", "Tone", "WPS", "Sixltr", "Dic", 
    "function", "pronoun", "ppron", "i", "we", "you", "shehe", "they", "ipron", 
    "article", "prep", "auxverb", "adverb", "conj", "negate", "verb", "adj", 
    "compare", "interrog", "number", "quant", "affect", 
    "anx", "anger", "sad", "social", "family", "friend", "female", "male", 
    "cogproc", "insight", "cause", "discrep", "tentat", "certain", "differ", 
    "percept", "see", "hear", "feel", "bio", "body", "health", "sexual", 
    "ingest", "drives", "affiliation", "achieve", "power", "reward", "risk", 
    "focuspast", "focuspresent", "focusfuture", "relativ", "motion", "space", 
    "time", "work", "leisure", "home", "money", "relig", "death", "informal", 
    "swear", "netspeak", "assent", "nonflu", "filler", "AllPunc", "Period", 
    "Comma", "Colon", "SemiC", "QMark", "Exclam", "Dash", "Quote", "Apostro", 
    "Parenth", "OtherP", "dateP"
]
df_reviews_only = df_reviews_only.drop(columns=columns_to_drop, errors='ignore')

df_reviews_only.rename(columns={'thumbstot': 'totalthumbs'}, inplace=True)
df_reviews_only['reviewer'] = df_reviews_only['reviewer'].str.replace("'", "").str.replace(".", "").str.lower()


def extract_title_from_url(url):
    path = urlparse(url).path
    title = path.split('/')[-1]
    return title.replace('-', ' ').lower().strip()

df_reviews_only['extracted_title'] = df_reviews_only['url'].apply(extract_title_from_url)


def clean_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_reviews_only['normalized_title'] = df_reviews_only['extracted_title'].apply(clean_text)
df_MovieID['normalized_title'] = df_MovieID['TITLE'].apply(clean_text)


df_merged = pd.merge(df_reviews_only, df_MovieID[['normalized_title', 'Movie_ID']], on='normalized_title', how='left')


unmatched_reviews = df_merged[df_merged['Movie_ID'].isna()]
if not unmatched_reviews.empty:
    print("Unmatched user reviews:")
    print(unmatched_reviews[['url', 'normalized_title']])


df_merged['review_id'] = range(1, len(df_merged) + 1)

# Handling numeric conversions (apply this to df_merged)
df_merged['posemo'] = df_merged['posemo'].astype(str).str.replace(',', '.').astype(float)
df_merged['negemo'] = df_merged['negemo'].astype(str).str.replace(',', '.').astype(float)


df_merged.to_csv('/Users/laiba/Documents/GitHub/gbm/cleaned_user_reviews.csv', index=False)

print("After merge, columns are:", df_merged.columns)
print(df_merged.head())


  df_user_reviews = pd.read_csv('UserReviews.txt', delimiter='\t', encoding='ISO-8859-1')


After merge, columns are: Index(['url', 'idvscore', 'reviewer', 'Rev', 'thumbsUp', 'thumbsTot', 'posemo',
       'negemo', 'extracted_title', 'normalized_title', 'Movie_ID',
       'review_id'],
      dtype='object')
                                        url idvscore       reviewer  \
0  https://www.metacritic.com/movie/bronson      8.0   longbottom94   
1  https://www.metacritic.com/movie/bronson      9.0        martinb   
2  https://www.metacritic.com/movie/bronson     10.0         jaakko   
3  https://www.metacritic.com/movie/bronson      6.0          capor   
4  https://www.metacritic.com/movie/bronson      8.0        orwellb   

                                                 Rev thumbsUp thumbsTot  \
0   'Many have dismissed this film for not explor...      2.0       2.0   
1   'Anyone who doesn t like this movie simply ju...      0.0       1.0   
2   'Not sure what to think at this film at first...      1.0       1.0   
3   'Nicely portrayed but it lacks the elements t...    

In [31]:
print(df_reviews_only.columns)


Index(['url', 'idvscore', 'reviewer', 'Rev', 'thumbsUp', 'thumbsTot', 'posemo',
       'negemo', 'extracted_title', 'normalized_title', 'review_id',
       'Movie_ID'],
      dtype='object')


In [28]:
print(df_reviews_only[['thumbsTot', 'thumbsUp']].isna().sum())


thumbsTot    0
thumbsUp     0
dtype: int64


In [32]:
max_bigint = 9223372036854775807


df_reviews_only.rename(columns={'thumbsTot': 'totalthumbs', 'thumbsUp': 'thumbsup'}, inplace=True)


df_reviews_only['totalthumbs'] = pd.to_numeric(df_reviews_only['totalthumbs'], errors='coerce')
df_reviews_only['thumbsup'] = pd.to_numeric(df_reviews_only['thumbsup'], errors='coerce')

# Apply modulo to ensure values stay within the bigint range
df_reviews_only['totalthumbs'] = df_reviews_only['totalthumbs'].apply(lambda x: x % max_bigint if pd.notna(x) else x)
df_reviews_only['thumbsup'] = df_reviews_only['thumbsup'].apply(lambda x: x % max_bigint if pd.notna(x) else x)
df_reviews_only['review_id'] = df_reviews_only['review_id'].apply(lambda x: x % max_bigint if pd.notna(x) else x)
df_reviews_only['Movie_ID'] = df_reviews_only['Movie_ID'].apply(lambda x: x % max_bigint if pd.notna(x) else x)


print("Max Movie_ID:", df_reviews_only['Movie_ID'].max())
print("Max review_id:", df_reviews_only['review_id'].max())
print("Max totalthumbs:", df_reviews_only['totalthumbs'].max())
print("Max thumbsup:", df_reviews_only['thumbsup'].max())


Max Movie_ID: 34421
Max review_id: 316125
Max totalthumbs: 1422.0
Max thumbsup: 562.0


In [50]:
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="Laiba786",
    port="5432"
)
cursor = conn.cursor()

create_movie_table_query = """
CREATE TABLE IF NOT EXISTS movie_table (
    Movie_ID SERIAL PRIMARY KEY,
    Title TEXT UNIQUE,
    WorldwideBoxOffice FLOAT
);
"""
create_user_reviews_query = """
CREATE TABLE IF NOT EXISTS user_reviews (
    review_id SERIAL PRIMARY KEY,
    reviewscore FLOAT,
    posemo FLOAT,
    negemo FLOAT,
    totalthumbs INTEGER,
    thumbsup INTEGER,
    rev TEXT,
    url TEXT,
    movie_id BIGINT REFERENCES movie_table (Movie_ID)
);
"""

cursor.execute(create_movie_table_query)
cursor.execute(create_user_reviews_query)
conn.commit()

insert_movie_query = """
INSERT INTO movie_table (Title, WorldwideBoxOffice)
VALUES (%s, %s)
RETURNING Movie_ID;
"""
insert_user_reviews_query = """
INSERT INTO user_reviews (review_id, movie_id, reviewscore, posemo, negemo, totalthumbs, thumbsup, rev, url)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (review_id) DO NOTHING;
"""

df_merged['thumbsUp'] = pd.to_numeric(df_merged['thumbsUp'], errors='coerce')
df_merged['thumbsTot'] = pd.to_numeric(df_merged['thumbsTot'], errors='coerce')
df_merged['posemo'] = pd.to_numeric(df_merged['posemo'], errors='coerce')
df_merged['negemo'] = pd.to_numeric(df_merged['negemo'], errors='coerce')
df_merged['idvscore'] = pd.to_numeric(df_merged['idvscore'], errors='coerce')

import math

for index, row in df_merged.iterrows():
    try:
        cursor.execute("SELECT Movie_ID FROM movie_table WHERE Title = %s", (row['extracted_title'],))
        existing_movie = cursor.fetchone()

        if existing_movie:
            movie_id = existing_movie[0]
        else:
            cursor.execute(insert_movie_query, (row['extracted_title'], None))
            movie_id = cursor.fetchone()[0]

        # Replace NaN with 0 for thumbsUp and thumbsTot
        thumbsup = row['thumbsUp'] if not math.isnan(row['thumbsUp']) else 0
        thumbstot = row['thumbsTot'] if not math.isnan(row['thumbsTot']) else 0

        # Ensure values are within the BIGINT range
        thumbsup = min(max(0, thumbsup), 9223372036854775807)
        thumbstot = min(max(0, thumbstot), 9223372036854775807)

        print(f"Inserting review_id: {row['review_id']}, movie_id: {movie_id}, thumbsTot: {thumbstot}, thumbsUp: {thumbsup}")
        
        cursor.execute(insert_user_reviews_query, (
            row['review_id'], movie_id, row['idvscore'], row['posemo'], row['negemo'],
            thumbstot, thumbsup, row['Rev'], row['url']
        ))
        
    except Exception as e:
        print(f"Error on row {index}: {e}")
        break

Inserting review_id: 1, movie_id: 20029, thumbsTot: 2.0, thumbsUp: 2.0
Inserting review_id: 2, movie_id: 20029, thumbsTot: 1.0, thumbsUp: 0
Inserting review_id: 3, movie_id: 20029, thumbsTot: 1.0, thumbsUp: 1.0
Inserting review_id: 4, movie_id: 20029, thumbsTot: 1.0, thumbsUp: 0
Inserting review_id: 5, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 6, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 7, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 8, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 9, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 10, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 11, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 12, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 13, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 14, movie_id: 20029, thumbsTot: 0, thumbsUp: 0
Inserting review_id: 15, movie_id: 20029, thu

KeyboardInterrupt: 