In [None]:
import pandas as pd

In [None]:
# Load the processed scripts table. The path points into /content/drive/MyDrive
# (presumably a mounted Google Drive in Colab — confirm before running elsewhere).
# The literal contains a space before '/Datasets'; it is kept exactly as written.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Final Project /Datasets/scripts/processed_scripts.csv',
)

In [None]:
# Read the location-extraction results CSV from the /content directory.
# NOTE(review): a file with this name is written by the batch-processing cell further
# down; on a fresh runtime this read presumably needs that cell (or an upload) first.
df = pd.read_csv(
    filepath_or_buffer='/content/location_extraction_results.csv',
)

In [None]:
from transformers import pipeline

# Build the Hugging Face token-classification ("ner") pipeline once, at module level,
# so every call to the extraction function below reuses the same pipeline object.
# The checkpoint name indicates a cased BERT-large model fine-tuned on CoNLL-03 English.
#
# aggregation_strategy="simple" groups consecutive tokens of the same entity and is the
# documented replacement for the deprecated grouped_entities=True flag.
# NOTE(review): the saved outputs in this notebook contain word-piece fragments such as
# '##ockspur'; a word-level strategy (e.g. "first") may avoid them, but it changes the
# extracted results — confirm before switching.
ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

# Function to extract locations
def extract_locations(script_text):
    """
    Return the location entity strings that the module-level ``ner`` pipeline finds.

    Args:
        script_text: Raw script/subtitle text. Any value that is not a non-blank
            string (None, NaN, numbers, whitespace-only text) is treated as
            "no text".

    Returns:
        list: The ``word`` of every entity whose ``entity_group`` is 'LOC' or 'GPE',
        in the order the pipeline returns them, duplicates included. Empty when
        there is no text to process.
    """
    # The isinstance check covers None/NaN and also keeps .strip() from being called
    # on non-string cells, which would otherwise raise AttributeError.
    if not isinstance(script_text, str) or not script_text.strip():
        return []

    # 'GPE' is kept so the filter matches what callers already rely on.
    # NOTE(review): the CoNLL-03 label set is presumably PER/ORG/LOC/MISC, in which
    # case only 'LOC' can ever match with the configured checkpoint — confirm.
    location_labels = {'LOC', 'GPE'}

    return [
        entity['word']
        for entity in ner(script_text)
        if entity['entity_group'] in location_labels
    ]

# Define batch processing function
def process_in_batches(data, batch_size, output_file):
    """
    Run location extraction over ``data`` in fixed-size row batches and write each
    finished batch to ``output_file`` before moving on to the next one.

    Args:
        data: DataFrame with a 'Script/Subtitle Text' column. It is not modified;
            each batch is processed on a copy.
        batch_size: Positive number of rows to process per batch.
        output_file: CSV path. The first batch overwrites it (with a header row);
            later batches are appended without a header.

    Returns:
        None. Results are written to ``output_file`` and progress is printed.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make range() yield nothing and silently skip all work.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_rows = len(data)
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)

        # Copy the slice: assigning a new column onto a bare .iloc slice triggers
        # SettingWithCopyWarning and leaves it ambiguous whether `data` is touched.
        batch = data.iloc[start_idx:end_idx].copy()
        batch['Extracted Locations'] = batch['Script/Subtitle Text'].apply(extract_locations)

        # Only the first batch starts a fresh file and writes the header row.
        is_first_batch = start_idx == 0
        batch.to_csv(
            output_file,
            mode='w' if is_first_batch else 'a',
            header=is_first_batch,
            index=False,
        )

        print(f"Processed rows {start_idx} to {end_idx} out of {total_rows}")

# Batch configuration: rows handled per batch and the CSV that receives the results.
batch_size = 500  # tune to the available memory and desired throughput
output_file = "location_extraction_results.csv"

# Run the extraction over the whole `data` frame, one batch at a time.
process_in_batches(data, batch_size=batch_size, output_file=output_file)

print("Batch processing completed! Results saved to {}.".format(output_file))


In [None]:
# Show the (rows, columns) dimensions of the results frame `df`.
df.shape

(18500, 5)

In [None]:
# Display the results frame `df` (a bare last expression renders as the cell output).
df

Unnamed: 0.1,Unnamed: 0,Movie Name,Script/Subtitle Text,Processed Script,Extracted Locations
0,0,My Sassy Girl,- This is the storyof the first and last timeI...,"['Ġthis', 'Ġis', 'Ġthe', 'Ġstory', 'Ġthe', 'Ġf...","['French Lick', 'Indiana']"
1,1,Friday After Next,"'Twas the night before Christmas,and all throu...","['tw', 'Ġthe', 'Ġnight', 'Ġbefore', 'Ġchrist',...",[]
2,2,Zillion,"1If you're stupid, thenyou can easily hide tha...","['Ġyou', 'Ġstupid', 'Ġthen', 'Ġcan', 'Ġeasily'...","['Meise', 'Meise', 'China']"
3,3,2002,"This is roof.Where did she fall from?Sir, she ...","['Ġis', 'Ġroof', 'Ġdid', 'Ġshe', 'Ġfall', 'Ġfr...",[]
4,4,Area 51,"1Reid, Ben, and Darrin and I,we've been friend...","['id', 'Ġben', 'Ġand', 'Ġd', 'arr', 'Ġand', 'Ġ...",[]
...,...,...,...,...,...
18495,18495,We Dive at Dawn,"- Nearly at the boom, sir.- Yes, we'll be in b...","['Ġnearly', 'Ġat', 'Ġthe', 'Ġboom', 'Ġsir', 'Ġ...",[]
18496,18496,Yellow Canary,CHURCH BELLS RINGCLOCK STRIKES NINEI'm afraid ...,"['church', 'Ġbells', 'Ġring', 'clock', 'Ġstrik...","['British Museum', 'Circle', 'WA', '##ockspur'..."
18497,18497,Young and Willing,"Well, what do you think?Is she woman or is she...","['well', 'Ġwhat', 'Ġdo', 'Ġyou', 'Ġthink', 'Ġs...","['Empire State Building', 'York']"
18498,18498,The Abandoned,[INSTRUMENTAL MUSIC][INSTRUMENTAL MUSIC][SIREN...,"['Ġmostly', 'Ġcloudy', 'Ġscattered', 'Ġshowers...",['RA']


In [None]:
# Read the second results CSV from /content.
# NOTE(review): the file name suggests it holds results for the last ~4100 rows of the
# dataset, and no cell in view produces it — confirm where it comes from.
df1 = pd.read_csv(
    filepath_or_buffer='/content/location_extraction_results_last_4100.csv',
)

In [None]:
# Remove the "Unnamed: 0" column from df1 when it exists; errors="ignore" makes
# this a no-op if the column is absent, so the cell is safe to re-run.
columns_to_drop = ["Unnamed: 0"]
df1 = df1.drop(labels=columns_to_drop, axis="columns", errors="ignore")
df1


Unnamed: 0,Movie Name,Script/Subtitle Text,Processed Script,Extracted Locations
0,Alaska,1Quincy Air Serviceto Super Cub 909-Tango.Do y...,"['quin', 'cy', 'Ġair', 'Ġservic', 'eto', 'Ġsup...","['Denali', 'Quincy', 'Earth']"
1,And Now Tomorrow,"That means you can't doanything for me, doesn'...","['Ġmeans', 'Ġyou', 'Ġcant', 'Ġdo', 'anything',...","['Vienna', 'London', 'Johns Hopkins', 'Rochest..."
2,Arsenic and Old Lace,ARSENIC AND OLD LACEWritten by\r\n\r\n ...,"['ars', 'enic', 'Ġand', 'Ġold', 'Ġlac', 'ew', ...","['Brooklyn', 'New York', 'Brooklyn', 'Brooklyn']"
3,Atlantic City,"Look, it's Ganesh!It's a sign from heaven.Good...","['look', 'Ġits', 'Ġg', 'esh', 'Ġa', 'Ġsign', '...","['Atlantic City', 'Atlantic City', 'Utah', 'Co..."
4,Aventure malgache,The entire world has heard aboutthe dramatic e...,"['Ġentire', 'Ġworld', 'Ġhas', 'Ġheard', 'Ġabou...","['Empire', '##lwyn Studios', 'England', 'Great..."
...,...,...,...,...
4144,Yellow Sky,"[Thunderclap]Hey, look at that!Look. Right thr...","['hey', 'Ġlook', 'Ġat', 'Ġthat', 'look', 'Ġrig...",[]
4145,You Gotta Stay Happy,"We've been waitingsome time.Hey! Hey, pop!Excu...","['Ġbeen', 'Ġwait', 'ings', 'ome', 'Ġtime', 'he...",[]
4146,Mama,Hearing-transcriptedEnglish subtitle by watdde...,"['aring', 'trans', 'cript', 'ed', 'english', '...",[]
4147,A Haunting We Will Go,"Well, gentlemen, we trust you|spent a comforta...","['well', 'Ġgentlemen', 'Ġwe', 'Ġtrust', 'Ġyou'...","['Florida', 'Florida', 'Florida', 'Florida', '..."


In [None]:
# Read the main location-extraction results CSV from /content into df2.
# NOTE(review): this is the same path that an earlier cell loads into `df`.
df2 = pd.read_csv(
    filepath_or_buffer='/content/location_extraction_results.csv',
)

In [None]:
# Remove the "Unnamed: 0" column from df2 when it exists; errors="ignore" makes
# this a no-op if the column is absent, so the cell is safe to re-run.
columns_to_drop = ["Unnamed: 0"]
df2 = df2.drop(labels=columns_to_drop, axis="columns", errors="ignore")
df2

Unnamed: 0,Movie Name,Script/Subtitle Text,Processed Script,Extracted Locations
0,My Sassy Girl,- This is the storyof the first and last timeI...,"['Ġthis', 'Ġis', 'Ġthe', 'Ġstory', 'Ġthe', 'Ġf...","['French Lick', 'Indiana']"
1,Friday After Next,"'Twas the night before Christmas,and all throu...","['tw', 'Ġthe', 'Ġnight', 'Ġbefore', 'Ġchrist',...",[]
2,Zillion,"1If you're stupid, thenyou can easily hide tha...","['Ġyou', 'Ġstupid', 'Ġthen', 'Ġcan', 'Ġeasily'...","['Meise', 'Meise', 'China']"
3,2002,"This is roof.Where did she fall from?Sir, she ...","['Ġis', 'Ġroof', 'Ġdid', 'Ġshe', 'Ġfall', 'Ġfr...",[]
4,Area 51,"1Reid, Ben, and Darrin and I,we've been friend...","['id', 'Ġben', 'Ġand', 'Ġd', 'arr', 'Ġand', 'Ġ...",[]
...,...,...,...,...
18495,We Dive at Dawn,"- Nearly at the boom, sir.- Yes, we'll be in b...","['Ġnearly', 'Ġat', 'Ġthe', 'Ġboom', 'Ġsir', 'Ġ...",[]
18496,Yellow Canary,CHURCH BELLS RINGCLOCK STRIKES NINEI'm afraid ...,"['church', 'Ġbells', 'Ġring', 'clock', 'Ġstrik...","['British Museum', 'Circle', 'WA', '##ockspur'..."
18497,Young and Willing,"Well, what do you think?Is she woman or is she...","['well', 'Ġwhat', 'Ġdo', 'Ġyou', 'Ġthink', 'Ġs...","['Empire State Building', 'York']"
18498,The Abandoned,[INSTRUMENTAL MUSIC][INSTRUMENTAL MUSIC][SIREN...,"['Ġmostly', 'Ġcloudy', 'Ġscattered', 'Ġshowers...",['RA']


In [None]:
# Stack the two result frames row-wise (df1 first, then df2) and renumber the
# index from 0 so the combined frame has no duplicate index labels.
df_combined = pd.concat(objs=[df1, df2], ignore_index=True)


In [None]:
# Display the concatenated frame to check the result of the row-wise merge.
df_combined

Unnamed: 0,Movie Name,Script/Subtitle Text,Processed Script,Extracted Locations
0,Alaska,1Quincy Air Serviceto Super Cub 909-Tango.Do y...,"['quin', 'cy', 'Ġair', 'Ġservic', 'eto', 'Ġsup...","['Denali', 'Quincy', 'Earth']"
1,And Now Tomorrow,"That means you can't doanything for me, doesn'...","['Ġmeans', 'Ġyou', 'Ġcant', 'Ġdo', 'anything',...","['Vienna', 'London', 'Johns Hopkins', 'Rochest..."
2,Arsenic and Old Lace,ARSENIC AND OLD LACEWritten by\r\n\r\n ...,"['ars', 'enic', 'Ġand', 'Ġold', 'Ġlac', 'ew', ...","['Brooklyn', 'New York', 'Brooklyn', 'Brooklyn']"
3,Atlantic City,"Look, it's Ganesh!It's a sign from heaven.Good...","['look', 'Ġits', 'Ġg', 'esh', 'Ġa', 'Ġsign', '...","['Atlantic City', 'Atlantic City', 'Utah', 'Co..."
4,Aventure malgache,The entire world has heard aboutthe dramatic e...,"['Ġentire', 'Ġworld', 'Ġhas', 'Ġheard', 'Ġabou...","['Empire', '##lwyn Studios', 'England', 'Great..."
...,...,...,...,...
22644,We Dive at Dawn,"- Nearly at the boom, sir.- Yes, we'll be in b...","['Ġnearly', 'Ġat', 'Ġthe', 'Ġboom', 'Ġsir', 'Ġ...",[]
22645,Yellow Canary,CHURCH BELLS RINGCLOCK STRIKES NINEI'm afraid ...,"['church', 'Ġbells', 'Ġring', 'clock', 'Ġstrik...","['British Museum', 'Circle', 'WA', '##ockspur'..."
22646,Young and Willing,"Well, what do you think?Is she woman or is she...","['well', 'Ġwhat', 'Ġdo', 'Ġyou', 'Ġthink', 'Ġs...","['Empire State Building', 'York']"
22647,The Abandoned,[INSTRUMENTAL MUSIC][INSTRUMENTAL MUSIC][SIREN...,"['Ġmostly', 'Ġcloudy', 'Ġscattered', 'Ġshowers...",['RA']


In [None]:
# Write the combined results to a CSV in the current working directory;
# index=False keeps the row index out of the file.
df_combined.to_csv(
    path_or_buf='location_extracted.csv',
    index=False,
)