In [1]:
# Load packages
import os
import pandas as pd
import json
import spacy
from spacy.matcher import Matcher
# Load the spaCy pipeline "en_core_web_sm"; later cells use it to process the movie texts
nlp=spacy.load("en_core_web_sm")

In [None]:
# Mount Google Drive inside the Colab runtime (this cell only works on Colab)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# NOTE(review): " " looks like a placeholder — set it to the project folder before running
os.chdir(" ") # make the project folder the working directory so the relative file names below resolve
os.listdir() # list the folder contents to confirm the data files are there

Mounted at /content/drive


['MovieDataClean2.xlsx',
 'test.xlsx',
 'mwe.json',
 'Test2.xlsx',
 'colab_pdf.py',
 '__pycache__',
 'tfidf_movies.csv',
 'with_similarity.csv',
 'old_data',
 'Test3.xlsx',
 'mwe2.json',
 'mwe3.json',
 'MovieData_Spacy.xlsx',
 'intensifiers.xlsx']

In [6]:
# Load data: read the cleaned movie spreadsheet, using its first column as the row index

df = pd.read_excel("MovieDataClean2.xlsx", index_col=0)
df.head() # preview the first rows

Unnamed: 0_level_0,MovieID,MovieTitle,MovieYear,NWriters,WriterGender,Gender,Text
Column1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,m0,10 things i hate about you,1999,2,W,F,They do not! I hope so. Let's go. Okay -- you'...
2,m0,10 things i hate about you,1999,2,W,M,"They do to! She okay? Wow No The ""real you"". I..."
3,m1,1492: conquest of paradise,1992,1,W,F,
4,m1,1492: conquest of paradise,1992,1,W,M,Asia can be found to the west -- and I will pr...
5,m10,affliction,1997,1,M,F,All the figures show is that Gordon LaRiviere ...


In [7]:
# Single-word intensifiers: read the "intensifiers_singleword" sheet, preview it,
# and keep the "Intensifier" column as a plain Python list
dfintensifiers = pd.read_excel(
    "intensifiers.xlsx",
    sheet_name="intensifiers_singleword",
)
print(dfintensifiers.head())
intensifierslist = dfintensifiers["Intensifier"].tolist()

  Intensifier        Source
0  absolutely  Fuchs (2017)
1         all  Fuchs (2017)
2  altogether  Fuchs (2017)
3   amazingly  Fuchs (2017)
4       awful  Fuchs (2017)


In [8]:
# Multiword expressions: read the "intensifiers_mwe" sheet, preview it,
# and keep the "Intensifier" column as a plain Python list
dfmwe = pd.read_excel(
    "intensifiers.xlsx",
    sheet_name="intensifiers_mwe",
)
print(dfmwe.head())
mwelist = dfmwe["Intensifier"].tolist()

    Intensifier        Source
0        by far  Fuchs (2017)
1       quite a  Fuchs (2017)
2      quite an  Fuchs (2017)
3   *Num* times          USAS
4  a darn sight          USAS


In [12]:
# Dictionary of multiword expressions with pos tags.
# The entries are later read via entry["name"] and entry["pattern"] to build the Matcher.
# The encoding is given explicitly so the file is decoded the same way on every platform
# (JSON text is UTF-8) instead of relying on the locale default.
with open('mwedict.json', 'r', encoding='utf-8') as file:
    mwe_dict = json.load(file)

In [10]:
# Safe divide

def safe_divide(a, b):
    """Return ``a / b``, or ``0`` when the divisor ``b`` equals zero.

    Used so that a zero denominator yields a rate of 0 instead of
    raising ``ZeroDivisionError``.
    """
    return a / b if b != 0 else 0

In [11]:
# NLP pipeline: run every text through spaCy once and keep the resulting docs in a list.
# Missing texts are replaced by '' so that movie_docs has exactly one doc per row of df.
# The components named in `disable` are switched off for this run.
# NOTE(review): token.pos_ is read in a later cell, so whichever component sets it must
# stay enabled — "morphologizer" is disabled here; confirm POS tags are still populated
# for this model.
movie_docs = list(nlp.pipe(df.Text.fillna(''), disable=["parser", "ner", "entity_ruler",
    "entity_linker", "span_ruler", "textcat", "morphologizer", "transformer", "coref"], batch_size=10))

In [13]:
# Per-document results, in the same order as movie_docs (and therefore as the rows of df)
nw_final = [] # number of words per document
intensifiers_final = [] # intensifiers per million words, per document

# Register every multiword-expression pattern with the matcher
matcher = Matcher(nlp.vocab)
for entry in mwe_dict:
    matcher.add(entry["name"], [entry["pattern"]])

# Set copy of the single-word list: O(1) membership tests inside the token loop
intensifier_set = set(intensifierslist)

for doc in movie_docs:
    # Number of words: every token that is neither punctuation nor whitespace
    nw = sum(1 for token in doc if not (token.is_punct or token.is_space))

    # Multiword intensifiers: one count per matcher hit.
    # NOTE(review): the Matcher reports every hit, so overlapping patterns would each be
    # counted — confirm the patterns in mwe_dict cannot overlap.
    matches = matcher(doc)
    intensifiers = len(matches)

    # Token indices that belong to a multiword match; they are skipped below so the
    # same word is not counted again as a single-word intensifier
    covered = set()
    for _, start, end in matches:
        covered.update(range(start, end))

    # Single-word intensifiers: the token must be an adverb or adjective AND appear in
    # the intensifier list. The POS test is grouped explicitly because `and` binds
    # tighter than `or`; without the grouping every adverb was counted, listed or not.
    for i, token in enumerate(doc):
        if token.is_punct or token.is_space:
            continue
        if i in covered:
            continue
        if token.pos_ in ("ADV", "ADJ") and token.lower_ in intensifier_set:
            intensifiers += 1

    nw_final.append(nw) # word count for this document
    intensifiers_final.append(safe_divide(intensifiers, nw)*1000000) # intensifiers per million words

df['NW'] = nw_final
df['intensifiers'] = intensifiers_final

In [14]:
# Save for analysis: preview the frame with the new NW / intensifiers columns,
# then write it to an Excel file in the current working directory
print(df.head())
df.to_excel("MovieData_Spacy.xlsx")

        MovieID                  MovieTitle  MovieYear  NWriters WriterGender  \
Column1                                                                         
1            m0  10 things i hate about you       1999         2            W   
2            m0  10 things i hate about you       1999         2            W   
3            m1  1492: conquest of paradise       1992         1            W   
4            m1  1492: conquest of paradise       1992         1            W   
5           m10                  affliction       1997         1            M   

        Gender                                               Text    NW  \
Column1                                                                   
1            F  They do not! I hope so. Let's go. Okay -- you'...  2446   
2            M  They do to! She okay? Wow No The "real you". I...  2996   
3            F                                                NaN     0   
4            M  Asia can be found to the west -- and I wi