<a href="https://colab.research.google.com/github/harshitaachadha/Data-Mining-Script-Rating-Prediction/blob/main/Named_entity_recognition_using_spacy_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To-do list:
- [ ] Number of different locations
- [ ] Emotional Archetypes
- [X] lines per character
- [ ] frequent character sets
- [X] Total spoken words
- [X] list of characters


In [None]:
import spacy
import en_core_web_sm
import re
# Load the small English spaCy pipeline once; extract_character_names calls this `nlp` object on each script.
nlp = en_core_web_sm.load()

In [None]:
import pandas as pd
# Read the prepared script table and preview its first rows.
SCRIPT_CSV_PATH = 'ready_for_mining_script.csv'
df = pd.read_csv(SCRIPT_CSV_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,text script,Title,Episode,Season
0,0,"[Scene: The Subway, Phoebe is singing for\ncha...",The Pilot-The Uncut Version,1.0,1.0
1,1,"[Scene Central Perk, everyone's there.]\nMonic...",The One With The Sonogram At the End,2.0,1.0
2,2,"[Scene: Chandler and Joey's, Chandler is helpi...",The One With The Thumb,3.0,1.0
3,3,"[Scene: Central Perk, Ross and Monica are watc...",The One With George Stephanopoulos,4.0,1.0
4,4,"[Scene: Central Perk, all six are there.]\nMon...",The One With The East German Laundry Detergant,5.0,1.0


In [None]:
def extract_character_names(text):
    """Return the speaker names found in an episode script.

    A candidate is any word that is followed (optionally after whitespace) by
    a colon, e.g. ``Monica:``. Only candidates that the spaCy pipeline also
    reports as a named entity are kept.

    Args:
        text: Raw script text. ``None``, a non-string value or an empty
            string yields ``[]``.

    Returns:
        Sorted list of unique character names.
    """
    if not isinstance(text, str) or not text:
        return []
    # Capture only the word itself so that "Monica :" yields "Monica" rather
    # than "Monica " (which could never equal an entity text).
    candidates = set(re.findall(r'(\w+)\s*:', text))
    if not candidates:
        # No speaker labels at all: skip the expensive NLP pass entirely.
        return []
    # Build the entity lookup once instead of rescanning doc.ents per candidate.
    entity_texts = {ent.text for ent in nlp(text).ents}
    # Sorted so the result is deterministic (set iteration order is not).
    return sorted(candidates & entity_texts)

In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
# Start (or reuse) the Spark session used for the per-episode feature extraction.
spark = (
    SparkSession.builder
    .appName("CharacterExtraction")
    .getOrCreate()
)
# Wrap the Python extractor as a Spark UDF that returns an array of strings.
extract_character_names_udf = udf(
    extract_character_names,
    returnType=ArrayType(StringType()),
)

In [None]:
# Convert the pandas frame to a Spark DataFrame. The isinstance guard keeps this cell
# re-runnable: once `df` already is a Spark DataFrame the conversion is skipped
# instead of raising.
if isinstance(df, pd.DataFrame):
    df = spark.createDataFrame(df)
# Apply the UDF to create the 'Character List' column
df = df.withColumn("Character List", extract_character_names_udf(df["text script"]))
# Uncomment to inspect the full rows:
#df.show(truncate=False)

In [None]:
# One-off sanity check that already passed (no need to run it again):
# counts the episodes whose character list is empty. Expected result: 0.
from pyspark.sql.functions import size
has_no_characters = size("Character List") == 0
df.filter(has_no_characters).count()

0

In [None]:
# Every episode now has a non-empty character list. I will still not do any text cleaning, because episodes
# represent character lines in different ways; instead I'll first get the words spoken per character per episode.
from pyspark.sql.types import MapType, StringType
import re
def count_words(text):
    """Count the words spoken by each character in an episode script.

    Stage directions in square brackets or parentheses are removed first. A
    line containing a colon is read as ``<speaker>: <speech>``; a line without
    one is treated as a continuation of the previous speaker's speech.

    Args:
        text: Raw script text, one script line per newline. ``None``, a
            non-string value or an empty string yields an empty dict.

    Returns:
        Dict mapping speaker name to the number of whitespace-separated words
        attributed to that speaker.
    """
    if not isinstance(text, str) or not text:
        return {}
    # Strip stage directions from the whole text *before* splitting into lines.
    # The negated character classes also match newlines, so a direction that
    # wraps onto the next line (e.g. "[Scene: ... singing for\nchange.]") is
    # removed in one piece instead of being miscounted as a "[Scene" speaker.
    text = re.sub(r'\[[^\[\]]*\]|\([^()]*\)', '', text)
    words_spoken = {}
    last_character = None
    for line in text.split('\n'):
        if ':' in line:
            # Split on the first colon only; later colons belong to the speech.
            character, spoken_text = line.split(':', 1)
            character = character.strip()
            words_spoken[character] = words_spoken.get(character, 0) + len(spoken_text.split())
            last_character = character
        elif last_character is not None:
            # Continuation line: credit it to the most recent speaker.
            words_spoken[last_character] += len(line.split())
    return words_spoken


from pyspark.sql.types import IntegerType

# count_words returns integer counts, so the map's value type is declared as IntegerType.
# Declaring the values as StringType risks the counts coming back as strings once the
# column is materialised, which breaks arithmetic on them.
count_words_udf = udf(count_words, MapType(StringType(), IntegerType()))
df = df.withColumn("WordsSpoken", count_words_udf(df["text script"]))
df.show()


+----------+--------------------+--------------------+-------+------+--------------------+--------------------+
|Unnamed: 0|         text script|               Title|Episode|Season|      Character List|         WordsSpoken|
+----------+--------------------+--------------------+-------+------+--------------------+--------------------+
|         0|[Scene: The Subwa...| The Pilot-The Un...|    1.0|   1.0|[Monica, Frannie,...|{All -> 27, Phoeb...|
|         1|[Scene Central Pe...| The One With The...|    2.0|   1.0|[Monica, Susan, R...|{All -> 5, Phoebe...|
|         2|[Scene: Chandler ...| The One With The...|    3.0|   1.0|[Monica, Rachel, ...|{Phoebe -> 427, A...|
|         3|[Scene: Central P...| The One With Geo...|    4.0|   1.0|[Monica, Rachel, ...|{Receptionist -> ...|
|         4|[Scene: Central P...| The One With The...|    5.0|   1.0|[Monica, Angela, ...|{Phoebe -> 235, B...|
|         5|[Scene: The Theat...| The One With The...|    6.0|   1.0|[Monica, Rachel, ...|{All -> 56, Ph

In [None]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Define a UDF to calculate the sum of word counts in the "WordsSpoken" structure
def calculate_total_words(words_spoken):
    """Sum the per-character word counts of one episode.

    Args:
        words_spoken: Mapping of character name to word count. Counts may
            arrive as ints or as numeric strings depending on how the map
            column is typed, so each one is coerced with ``int``. ``None`` or
            an empty mapping yields 0.

    Returns:
        Total number of words as an int.
    """
    if not words_spoken:
        return 0
    return sum(int(word_count) for word_count in words_spoken.values())

# Register the UDF
calculate_total_words_udf = udf(calculate_total_words, returnType=IntegerType())

# Add "TotalWords": the episode-level sum of the "WordsSpoken" counts
df = df.withColumn(
    "TotalWords",
    calculate_total_words_udf(col("WordsSpoken")),
)

# Preview the extended DataFrame
df.show()


+----------+--------------------+--------------------+-------+------+--------------------+--------------------+----------+
|Unnamed: 0|         text script|               Title|Episode|Season|      Character List|         WordsSpoken|TotalWords|
+----------+--------------------+--------------------+-------+------+--------------------+--------------------+----------+
|         0|[Scene: The Subwa...| The Pilot-The Un...|    1.0|   1.0|[Monica, Frannie,...|{All -> 27, Phoeb...|      2290|
|         1|[Scene Central Pe...| The One With The...|    2.0|   1.0|[Monica, Susan, R...|{All -> 5, Phoebe...|      2520|
|         2|[Scene: Chandler ...| The One With The...|    3.0|   1.0|[Monica, Rachel, ...|{Phoebe -> 427, A...|      2336|
|         3|[Scene: Central P...| The One With Geo...|    4.0|   1.0|[Monica, Rachel, ...|{Receptionist -> ...|      2721|
|         4|[Scene: Central P...| The One With The...|    5.0|   1.0|[Monica, Angela, ...|{Phoebe -> 235, B...|      2765|
|         5|[Sce

In [None]:
# One-off sanity check that already passed (no need to run it again):
# counts the episodes whose total word count is zero. Expected result: 0.
from pyspark.sql.functions import size
has_no_words = col("TotalWords") == 0
df.filter(has_no_words).count()

0

In [None]:
import nltk
# Fetch the 'punkt' tokenizer data — presumably what word_tokenize needs further down; TODO confirm.
nltk.download('punkt')
# NLTK's Stanford NER tagger class; instantiated in extract_locations_organizations.
from nltk.tag.stanford import StanfordNERTagger

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def extract_locations_organizations(text):
    """Extract location and organization names from a script's stage directions.

    Only text inside square brackets is analysed. Each bracketed passage is
    tokenized and tagged with the Stanford NER model; every run of consecutive
    LOCATION (or ORGANIZATION) tokens becomes one entity string, so two
    separate places mentioned in the same passage stay separate.

    Args:
        text: Raw script text. ``None``, a non-string value, or a script
            without bracketed passages yields ``[]``.

    Returns:
        List of entity strings: all locations first, then all organizations,
        each group in order of appearance.
    """
    if not isinstance(text, str):
        return []
    # The negated character class also matches newlines, so stage directions
    # that wrap across lines are picked up as well.
    bracket_contents = re.findall(r'\[([^\[\]]*)\]', text)
    if not bracket_contents:
        return []

    # word_tokenize was used without ever being imported (NameError at call
    # time). Imported here, after the early returns, so inputs with nothing to
    # tag never need NLTK or the tagger files.
    from itertools import groupby
    from nltk.tokenize import word_tokenize

    PATH_TO_JAR = 'stanford-ner.jar'
    PATH_TO_MODEL = 'english.all.3class.distsim.crf.ser.gz'
    tagger = StanfordNERTagger(model_filename=PATH_TO_MODEL, path_to_jar=PATH_TO_JAR, encoding='utf-8')

    # Tag all passages with a single tagger call instead of one call per
    # passage; empty token lists are dropped because there is nothing to tag.
    tokenized = [tokens for tokens in map(word_tokenize, bracket_contents) if tokens]
    if not tokenized:
        return []
    tagged_passages = tagger.tag_sents(tokenized)

    locations = []
    organizations = []
    for ner_tags in tagged_passages:
        # Group consecutive tokens sharing a tag so each run is one entity.
        for tag, run in groupby(ner_tags, key=lambda pair: pair[1]):
            if tag == 'LOCATION':
                locations.append(" ".join(word for word, _ in run))
            elif tag == 'ORGANIZATION':
                organizations.append(" ".join(word for word, _ in run))

    return locations + organizations

In [None]:
# Wrap the entity extractor as a Spark UDF returning an array of strings,
# then store its result for every script in column 'n'.
extract_entities_udf = udf(
    extract_locations_organizations,
    returnType=ArrayType(StringType()),
)
df = df.withColumn(
    'n',
    extract_entities_udf(df['text script']),
)
df.show()

+----------+--------------------+--------------------+-------+------+--------------------+--------------------+----------+--------------------+
|Unnamed: 0|         text script|               Title|Episode|Season|      Character List|         WordsSpoken|TotalWords|                   n|
+----------+--------------------+--------------------+-------+------+--------------------+--------------------+----------+--------------------+
|         0|[Scene: The Subwa...| The Pilot-The Un...|    1.0|   1.0|[Monica, Frannie,...|{All -> 27, Phoeb...|      2290|           [Iridium]|
|         1|[Scene Central Pe...| The One With The...|    2.0|   1.0|[Monica, Susan, R...|{All -> 5, Phoebe...|      2520|      [Central Park]|
|         2|[Scene: Chandler ...| The One With The...|    3.0|   1.0|[Monica, Rachel, ...|{Phoebe -> 427, A...|      2336|[Central Perk, Ir...|
|         3|[Scene: Central P...| The One With Geo...|    4.0|   1.0|[Monica, Rachel, ...|{Receptionist -> ...|      2721|              

In [None]:
# One-off sanity check that already passed (no need to run it again):
# counts the episodes for which the entity list in column 'n' is empty.
from pyspark.sql.functions import size
has_no_entities = size("n") == 0
df.filter(has_no_entities).count()