In [35]:
import pandas as pd
import re
from urllib.parse import urlparse

df_user_reviews = pd.read_csv('UserReviews.txt', delimiter='\t', encoding='ISO-8859-1')
df_meta = pd.read_csv('metaClean43Brightspace.txt', delimiter='\t', encoding='ISO-8859-1')
df_MovieID = pd.read_csv('NewMovieTable.csv', encoding='ISO-8859-1')

df_reviews_only = df_user_reviews[df_user_reviews['Rev'].notna() & (df_user_reviews['Rev'].str.strip() != '')]


df_reviews_only.drop_duplicates(inplace=True)


columns_to_drop = [
    "WC", "Analytic", "Clout", "Authentic", "Tone", "WPS", "Sixltr", "Dic", 
    "function", "pronoun", "ppron", "i", "we", "you", "shehe", "they", "ipron", 
    "article", "prep", "auxverb", "adverb", "conj", "negate", "verb", "adj", 
    "compare", "interrog", "number", "quant", "affect", 
    "anx", "anger", "sad", "social", "family", "friend", "female", "male", 
    "cogproc", "insight", "cause", "discrep", "tentat", "certain", "differ", 
    "percept", "see", "hear", "feel", "bio", "body", "health", "sexual", 
    "ingest", "drives", "affiliation", "achieve", "power", "reward", "risk", 
    "focuspast", "focuspresent", "focusfuture", "relativ", "motion", "space", 
    "time", "work", "leisure", "home", "money", "relig", "death", "informal", 
    "swear", "netspeak", "assent", "nonflu", "filler", "AllPunc", "Period", 
    "Comma", "Colon", "SemiC", "QMark", "Exclam", "Dash", "Quote", "Apostro", 
    "Parenth", "OtherP", "dateP"
]
df_reviews_only = df_reviews_only.drop(columns=columns_to_drop, errors='ignore')


df_reviews_only['reviewer'] = df_reviews_only['reviewer'].str.replace("'", "").str.replace(".", "").str.lower()


def extract_title_from_url(url):
    path = urlparse(url).path
    title = path.split('/')[-1]
    return title.replace('-', ' ').lower().strip()

df_reviews_only['extracted_title'] = df_reviews_only['url'].apply(extract_title_from_url)


def clean_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_reviews_only['normalized_title'] = df_reviews_only['extracted_title'].apply(clean_text)
df_MovieID['normalized_title'] = df_MovieID['TITLE'].apply(clean_text)


df_reviews_only['review_id'] = range(1, len(df_reviews_only) + 1)


df_reviews_only = df_reviews_only.merge(df_MovieID[['normalized_title', 'Movie_ID']], on='normalized_title', how='left')

print("After merge, columns are:", df_reviews_only.columns)
print(df_reviews_only.head())  # Check if 'ID' exists


if 'ID' in df_reviews_only.columns:
    df_reviews_only.rename(columns={'ID': 'Movie_ID'}, inplace=True)

df_reviews_only['posemo'] = df_reviews_only['posemo'].astype(str).str.replace(',', '.').astype(float)
df_reviews_only['negemo'] = df_reviews_only['negemo'].astype(str).str.replace(',', '.').astype(float)


unmatched_reviews = df_reviews_only[df_reviews_only['Movie_ID'].isna()]
if not unmatched_reviews.empty:
    print("Unmatched user reviews:")
    print(unmatched_reviews[['url', 'normalized_title']])


df_reviews_only.to_csv('/Users/laiba/Documents/GitHub/gbm/cleaned_user_reviews.csv', index=False)


print(df_reviews_only.head())


  df_user_reviews = pd.read_csv('UserReviews.txt', delimiter='\t', encoding='ISO-8859-1')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_only.drop_duplicates(inplace=True)


After merge, columns are: Index(['url', 'idvscore', 'reviewer', 'Rev', 'thumbsUp', 'thumbsTot', 'posemo',
       'negemo', 'extracted_title', 'normalized_title', 'review_id',
       'Movie_ID'],
      dtype='object')
                                        url idvscore       reviewer  \
0  https://www.metacritic.com/movie/bronson      8.0   longbottom94   
1  https://www.metacritic.com/movie/bronson      9.0        martinb   
2  https://www.metacritic.com/movie/bronson     10.0         jaakko   
3  https://www.metacritic.com/movie/bronson      6.0          capor   
4  https://www.metacritic.com/movie/bronson      8.0        orwellb   

                                                 Rev thumbsUp thumbsTot  \
0   'Many have dismissed this film for not explor...      2.0       2.0   
1   'Anyone who doesn t like this movie simply ju...      0.0       1.0   
2   'Not sure what to think at this film at first...      1.0       1.0   
3   'Nicely portrayed but it lacks the elements t...    

In [69]:
print("Columns in df_reviews_only:", df_reviews_only.columns)


Columns in df_reviews_only: Index(['url', 'idvscore', 'reviewer', 'Rev', 'thumbsUp', 'thumbsTot', 'posemo',
       'negemo', 'extracted_title', 'normalized_title', 'review_id',
       'Movie_ID'],
      dtype='object')


In [70]:
import psycopg2

# Establish connection to PostgreSQL
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="Laiba786",
    port="5432"
)

cursor = conn.cursor()

# Modify the table to use BIGINT for potentially large columns
cursor.execute("""
    ALTER TABLE user_reviews
    ALTER COLUMN Movie_ID TYPE BIGINT,
    ALTER COLUMN TotalThumbs TYPE BIGINT,
    ALTER COLUMN ThumbsUp TYPE BIGINT;
""")
conn.commit()

# Now, retry the data insert
insert_user_reviews_query = """
INSERT INTO user_reviews (Movie_ID, Url, Posemo, Negemo, ReviewScore, TotalThumbs, ThumbsUp, Rev)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (Review_ID) DO NOTHING;
"""

# Check for any extremely large values before inserting (optional step)
print(df_reviews_only[['Movie_ID', 'thumbsTot', 'thumbsUp']].max())

# Iterate through the dataframe and insert data into the database
for index, row in df_reviews_only.iterrows():
    cursor.execute(insert_user_reviews_query, (
        row['Movie_ID'],          # Movie ID
        row['url'],               # Review URL
        row['posemo'],            # Positive emotion score
        row['negemo'],            # Negative emotion score
        row['idvscore'],          # Review score (mapped from idvscore)
        row['thumbsTot'],         # Total thumbs (mapped from thumbsTot)
        row['thumbsUp'],          # Thumbs up count (mapped from thumbsUp)
        row['Rev']                # Review text (mapped from Rev)
    ))

# Commit changes
conn.commit()

# Close the cursor and connection
cursor.close()
conn.close()

print("Data inserted successfully!")


NumericValueOutOfRange: integer out of range


In [63]:
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="Laiba786", 
    port="5432"
)

cursor = conn.cursor()

create_user_reviews_query = """
CREATE TABLE IF NOT EXISTS user_reviews (
    Review_ID SERIAL PRIMARY KEY,
    Movie_ID INTEGER NOT NULL,
    Url TEXT NOT NULL,
    Posemo FLOAT,
    Negemo FLOAT,
    ReviewScore FLOAT,  
    ThumbsUp INTEGER,
    TotalThumbs INTEGER,  
    ReviewText TEXT, 
    FOREIGN KEY (Movie_ID) REFERENCES movie_table (Movie_ID)
);
"""
cursor.execute(create_user_reviews_query)
conn.commit()

insert_user_reviews_query = """
INSERT INTO user_reviews (Movie_ID, Url, Posemo, Negemo, ReviewScore, TotalThumbs, ThumbsUp, ReviewText)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (Review_ID) DO NOTHING;
"""

for index, row in df_reviews_only.iterrows():
    cursor.execute(insert_user_reviews_query, (
        row['Movie_ID'],          # Movie ID
        row['url'],               # Review URL
        row['posemo'],            # Positive emotion score
        row['negemo'],            # Negative emotion score
        row['idvscore'],          # Review score
        row['thumbsTot'],         # Total thumbs (use thumbsTot instead of TotalThumbs)
        row['thumbsUp'],          # Thumbs up count (use thumbsUp instead of thumbsup)
        row['Rev']                # Review text (use Rev instead of ReviewText)
    ))

conn.commit()

cursor.close()
conn.close()


UndefinedColumn: column "reviewscore" of relation "user_reviews" does not exist
LINE 2: ...INTO user_reviews (Movie_ID, Url, Posemo, Negemo, ReviewScor...
                                                             ^


In [57]:
import psycopg2


conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="Laiba786",
    port="5432"
)

cursor = conn.cursor()

create_movie_user_reviews_query = """
CREATE TABLE IF NOT EXISTS movie_user_reviews (
    Movie_ID INTEGER,
    Review_ID INTEGER,
    PRIMARY KEY (Movie_ID, Review_ID),
    FOREIGN KEY (Movie_ID) REFERENCES movie_table (Movie_ID),
    FOREIGN KEY (Review_ID) REFERENCES user_reviews (Review_ID)
);
"""
cursor.execute(create_movie_user_reviews_query)
conn.commit()

# Step 3: Prepare insert queries
insert_movie_query = """
INSERT INTO movie_table (Movie_ID, Title, WorldwideBoxOffice)
VALUES (%s, %s, %s)
ON CONFLICT (Movie_ID) DO NOTHING;
"""

insert_review_query = """
INSERT INTO user_reviews (Review_ID, Movie_ID, Url, Posemo, Negemo)
VALUES (%s, %s, %s, %s, %s)
ON CONFLICT (Review_ID) DO NOTHING;
"""

insert_movie_user_reviews_query = """
INSERT INTO movie_user_reviews (Movie_ID, Review_ID)
VALUES (%s, %s)
ON CONFLICT DO NOTHING;
"""

# Step 4: Iterate through the reviews dataframe
duplicate_title_movies = []  # To log Movie_IDs with duplicate titles

for index, row in df_reviews_only.iterrows():
    # Step 4a: Try to insert into movie_table
    try:
        cursor.execute(insert_movie_query, (row['Movie_ID'], row['extracted_title'], None))  # Insert movie
    except psycopg2.errors.UniqueViolation:
        conn.rollback()  # Rollback if a unique constraint is violated
        duplicate_title_movies.append((row['Movie_ID'], row['extracted_title']))
        continue  # Skip to the next row if there's a violation

    # Step 4b: Insert the review into user_reviews
    cursor.execute(insert_review_query, (
        row['review_id'], row['Movie_ID'], row['url'], row['posemo'], row['negemo']
    ))

    # Step 4c: Insert the relationship into movie_user_reviews
    cursor.execute(insert_movie_user_reviews_query, (row['Movie_ID'], row['review_id']))

# Step 5: Commit all changes to the database
conn.commit()

# Step 6: Print Movie_IDs with duplicate titles (for debugging purposes)
if duplicate_title_movies:
    print("Duplicate titles detected for the following Movie_IDs:")
    print(duplicate_title_movies)

# Step 7: Close the cursor and connection after all operations
cursor.close()
conn.close()


Duplicate titles detected for the following Movie_IDs:
[(18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am number four'), (18799, 'i am nu