In [19]:
import pandas as pd
from urllib.parse import urlparse
import re

In [20]:
def extract_title_from_url(url):
    """Derive a display title from the last path segment of a URL.

    Hyphens in the final path segment are replaced by spaces and the result
    is title-cased, e.g. ``.../movie/the-dark-knight`` -> ``"The Dark Knight"``.
    Query strings and fragments are ignored because only the parsed path is
    used.

    Args:
        url: URL string whose final path segment is a hyphen-separated slug.

    Returns:
        The title-cased slug, or ``""`` when the path has no segment.
    """
    # Strip trailing slashes first: ".../bronson/" would otherwise split into
    # a final empty segment and silently produce an empty title.
    path = urlparse(url).path.rstrip('/')
    slug = path.rsplit('/', 1)[-1]
    return slug.replace('-', ' ').title()

# Expert reviews: tab-separated text file, decoded as ISO-8859-1.
df_expert = pd.read_csv(
    'ExpertReviewsClean43LIWC.txt',
    sep='\t',
    encoding='ISO-8859-1',
)

# Sales data: sep=None lets the python engine sniff the delimiter.
df_sales = pd.read_csv(
    'SalesData_with_ids (1).csv',
    sep=None,
    engine='python',
    encoding='ISO-8859-1',
)

# Lower-cased title taken from the last path segment of each review URL.
df_expert['title'] = (
    df_expert['url']
    .apply(extract_title_from_url)
    .str.lower()
)


In [21]:
# Number the review rows 1..N in their current order to serve as a key.
df_expert['ExpertReview_ID'] = range(1, df_expert.shape[0] + 1)

In [22]:
import re 

# Spelled-out English numbers "zero".."twenty" mapped to their digit strings.
# Each word's position in the tuple is its numeric value.
number_map = {
    word: str(value)
    for value, word in enumerate((
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
        "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
        "twenty",
    ))
}

def clean_title(title):
    """Remove parenthesised years and punctuation from a title.

    First deletes any "(dddd)" group of exactly four digits in parentheses,
    then deletes every character that is neither a word character nor
    whitespace. Whitespace itself is left untouched.

    Args:
        title: Title string to clean.

    Returns:
        The cleaned title string.
    """
    without_year = re.sub(r'\(\d{4}\)', '', title)
    return re.sub(r'[^\w\s]', '', without_year)

def move_articles_to_end(title):
    """Move a leading article ("the" or "a") to the end of the title.

    The title is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed as a side effect. The article check is
    case-insensitive and the moved word keeps its original casing.

    Args:
        title: Title string, possibly empty.

    Returns:
        The title with a leading "the"/"a" moved to the end, or ``""`` when
        the title contains no words.
    """
    words = title.split()
    # An empty or whitespace-only title has no first word; indexing words[0]
    # would raise IndexError, so return the empty result directly.
    if not words:
        return ''
    if words[0].lower() in ("the", "a"):
        words = words[1:] + words[:1]
    return ' '.join(words)

def roman_to_arabic(roman):
    """Convert a string of upper-case Roman symbols to an integer.

    A symbol is subtracted when the symbol immediately to its right has a
    larger value, and added otherwise. Characters that are not one of
    I, V, X, L, C, D, M count as 0. No validation of the numeral is done.

    Args:
        roman: String of Roman-numeral characters.

    Returns:
        The integer total; 0 for an empty string.
    """
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    values = [symbol_values.get(symbol, 0) for symbol in roman]
    total = 0
    # Pair each value with its right-hand neighbour (0 after the last symbol).
    for current, following in zip(values, values[1:] + [0]):
        if current < following:
            total -= current
        else:
            total += current
    return total

def convert_roman_to_arabic(title):
    """Replace Roman-numeral words in a title with their Arabic value.

    The title is split on whitespace; each word that is a canonical Roman
    numeral (case-insensitive, I through MMMCMXCIX) is replaced by its decimal
    string, and all other words are kept as-is. Words are re-joined with
    single spaces.

    Note that real words which are also valid numerals (e.g. "i", "mix") are
    still converted; only malformed letter sequences are now left alone.

    Args:
        title: Title string.

    Returns:
        The title with Roman-numeral words converted to digits.
    """
    # Canonical numerals only. Testing merely for the letters IVXLCDM would
    # also rewrite ordinary words spelled with them, e.g. "dvd" -> "995".
    canonical_roman = (
        r'M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
    )
    converted_words = []
    for word in title.split():
        candidate = word.upper()
        # Words from split() are never empty, so a full match implies at
        # least one numeral symbol was consumed.
        if re.fullmatch(canonical_roman, candidate):
            converted_words.append(str(roman_to_arabic(candidate)))
        else:
            converted_words.append(word)
    return ' '.join(converted_words)


def convert_text_numbers_to_digits(title):
    """Replace spelled-out numbers in a title with digit strings.

    Each whitespace-separated word that is a key of ``number_map`` is swapped
    for its mapped value; other words pass through unchanged. The lookup is
    exact, so matching depends on the word's casing. Words are re-joined with
    single spaces.

    Args:
        title: Title string.

    Returns:
        The title with mapped number words replaced.
    """
    return ' '.join(number_map.get(token, token) for token in title.split())

def normalize_title(title):
    """Produce the normalized form of a title used as the merge key.

    Lower-cases the title and then applies, in order: ``clean_title``,
    ``move_articles_to_end``, ``convert_roman_to_arabic`` and
    ``convert_text_numbers_to_digits``.

    Args:
        title: Raw title string.

    Returns:
        The normalized title string.
    """
    pipeline = (
        clean_title,
        move_articles_to_end,
        convert_roman_to_arabic,
        convert_text_numbers_to_digits,
    )
    normalized = title.lower()
    for step in pipeline:
        normalized = step(normalized)
    return normalized

# Build the join key on both frames with the same normalization so that
# equivalent titles compare equal.
df_expert['normalized_title'] = df_expert['title'].apply(normalize_title)
df_sales['normalized_title'] = df_sales['title'].apply(normalize_title)

# Drop any Movie_ID left over from a previous run of this cell before merging.
# Without this, re-running the cell yields Movie_ID_x / Movie_ID_y and later
# selections of 'Movie_ID' fail with a KeyError. On a first run the drop is a
# no-op.
# NOTE(review): a left merge repeats an expert row once per sales row sharing
# its normalized_title, and unmatched rows get NaN (Movie_ID then prints as
# float) -- confirm sales keys are unique if one Movie_ID per review is
# expected.
df_expert = (
    df_expert
    .drop(columns=['Movie_ID'], errors='ignore')
    .merge(df_sales[['normalized_title', 'Movie_ID']],
           on='normalized_title', how='left')
)

In [23]:
# Plain-text preview of the first five merged expert-review rows.
print(df_expert.head(5))

                                        url  idvscore            reviewer  \
0  https://www.metacritic.com/movie/bronson     100.0    "Andrew O'Hehir"   
1  https://www.metacritic.com/movie/bronson      90.0        'A.O. Scott'   
2  https://www.metacritic.com/movie/bronson      90.0                None   
3  https://www.metacritic.com/movie/bronson      83.0       'Noel Murray'   
4  https://www.metacritic.com/movie/bronson      80.0   'Joshua Rothkopf'   

   dateP                                                Rev  WC  Analytic  \
0   None   'Bronson owes a little or a lot to Kubrick s ...  25     73.88   
1   None   'Bronson invites you to admire its protagonis...  30     13.07   
2   None   'Whether it s Peterson/Bronson s more theatri...  40     72.69   
3   None   'There are two Bronsons on display here: the ...  39     65.46   
4   None   'Refn has somehow found his way to an authent...  24     88.46   

   Clout  Authentic   Tone  ...  Exclam   Dash  Quote  Apostro  Parenth  \

In [24]:
# Movie attributes, taken from the sales frame.
movie_table = df_sales.loc[:, ['Movie_ID', 'title', 'runtime', 'theatre_count']]

# Link table pairing each expert-review row with its merged Movie_ID.
movie_expert_review = df_expert.loc[:, ['Movie_ID', 'ExpertReview_ID']]

# Per-review attributes: source URL plus the posemo/negemo columns.
expert_review_table = df_expert.loc[:, ['ExpertReview_ID', 'url', 'posemo', 'negemo']]

In [25]:
# Plain-text preview of the first five rows of each derived table.
print(movie_table.head(5))
print(movie_expert_review.head(5))
print(expert_review_table.head(5))

   Movie_ID                   title  runtime  theatre_count
0         1  bakha satang (s korea)    129.0            NaN
1         2               antitrust      NaN         2433.0
2         3                santitos    105.0            NaN
3         4      frank mcklusky c i      NaN            NaN
4         5      walk to remember a      NaN         2411.0
   Movie_ID  ExpertReview_ID
0   19702.0                1
1   19702.0                2
2   19702.0                3
3   19702.0                4
4   19702.0                5
   ExpertReview_ID                                       url  posemo  negemo
0                1  https://www.metacritic.com/movie/bronson    0.00    0.00
1                2  https://www.metacritic.com/movie/bronson    6.67    0.00
2                3  https://www.metacritic.com/movie/bronson    0.00    2.50
3                4  https://www.metacritic.com/movie/bronson    5.13    5.13
4                5  https://www.metacritic.com/movie/bronson    0.00    0.00


In [26]:
# Write each table as tab-separated text (despite the .csv extension),
# without the index column, encoded as ISO-8859-1.
movie_table.to_csv(
    'Movie_Table.csv', sep='\t', index=False, encoding='ISO-8859-1')
movie_expert_review.to_csv(
    'Movie_ExpertReview.csv', sep='\t', index=False, encoding='ISO-8859-1')
expert_review_table.to_csv(
    'ExpertReview_Table.csv', sep='\t', index=False, encoding='ISO-8859-1')