### Cleaning of South Park transcripts for analysis

<img src="https://forbes.hu/wp-content/uploads/2021/08/southpark2_forbes.jpg" width="1000"/>


In [2]:
import sys 
sys.path.append('../../')
from src.import_packages import *



In [3]:
data = pd.read_csv('data/transcript_scraped.csv')
data.head(3)

Unnamed: 0,speaker,line,season,episode_num,episode_title
0,The boys,"School days, school days, teacher's golden ru...",1,1,Cartman Gets an Anal Probe
1,Kyle Broflovski,"Ah, damn it! My little brother's trying to fol...",1,1,Cartman Gets an Anal Probe
2,Ike Broflovski,Eat banana.,1,1,Cartman Gets an Anal Probe


### Speakers

In [4]:
print(data.shape)

data = data[~data['speaker'].str.lower().str.contains('\d')]
data = data[~data['speaker'].str.lower().str.contains(',')]
data = data[~data['speaker'].isin(['Man', 'Woman'])]

print(data.shape)

(84229, 5)
(79769, 5)


In [5]:
mapper = {'Eric Cartman': 'Cartman', 'Cartman:': 'Cartman', 'New Cartman': 'Cartman', 'Liane Cartman': 'Liane', 'Kyle.': 'Kyle',
'Kyle:': 'Kyle', 'Kyle Broflovski': 'Kyle', 'Kyle (voiceover)': 'Kyle', 'Sheila Broflovski': 'Sheila', 'Gerald Broflovski': 'Gerald',
'Stan:': 'Stan', 'stan': 'Stan', 'Stan Marsh': 'Stan', 'Randy Marsh': 'Randy', 'Shelly Marsh': 'Shelly', 'Sharon Marsh': 'Sharon',
'"Kenny"': 'Kenny', 'Stuart McCormick': 'Stuart', 'Carol McCormick': 'Carol', 'Kenny McCormick': 'Kenny', 'Mrs. McCormick': 'Carol',
'Ms. McCormick': 'Carol', 'Garrison' : 'Mr. Garrison', 'Mr Garrison' : 'Mr. Garrison', 'Herbert Garrison' : 'Mr. Garrison', 
'Mr. Garrison:' : 'Mr. Garrison', 'Mackey' : 'Mr. Mackey', 'Mr Mackey' : 'Mr. Mackey', 'Mr.Mackey' : 'Mr. Mackey',
'Doctor' : 'Dr. Doctor', 'Dr. Mephesto' : 'Mephesto',
'Pricipal Victoria' : 'Principal Victoria', 'Victoria' : 'Principal Victoria', 'Barbrady' : 'Officer Barbrady', 'Ike Broflovski' : 'Ike',
'Mayor' : 'Mayor McDaniels', 'Mayor McDanniels' : 'Mayor McDaniels', 'Mayor McDaniels.' : 'Mayor McDaniels', 'Mayor McDaneils' : 'Mayor McDaniels'}

data['speaker'].replace(mapper, inplace = True)

In [6]:
# data[data['speaker'].str.lower().str.contains('man')]['speaker'].value_counts()

In [7]:
data['speaking_to'] = data.groupby('episode_title')['speaker'].shift(-1)

In [8]:
line_count = data['speaker'].value_counts()
drop_speakers = line_count[line_count <= 300].index.tolist()

data.loc[data['speaker'].isin(drop_speakers), 'speaker'] = 'Other'
data.loc[data['speaking_to'].isin(drop_speakers), 'speaking_to'] = 'Other'

In [9]:
#data['speaker'].value_counts()

That reduced all speakers to the ~top30 + an 'Other' category containing everybody else (won't use for analysis, modeling, etc...)

### Text

Clean from special characters, narrating sentences, numbers

In [10]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

STOP_WORDS = stopwords.words('english')
ADDITIONAL_STOP_WORDS = ['hello', 'hi', 'oh', 'get', 'yeah', 'well', 'like', 'gonna', 'let', 'okay', 'people', 'u', 'uh', 'hey', 'would', 'got', 'one', 'going', 'know', 'right', 'go', 'come']
lemmatizer = WordNetLemmatizer() 

cList = {
    "ain't": "am not",    "aren't": "are not",    "can't've": "cannot have",    "can't": "cannot",    "'cause": "because",    "could've": "could have",    "couldn't've": "could not have",
    "couldn't": "could not",    "didn't": "did not",    "doesn't": "does not",    "don't": "do not",    "hadn't've": "had not have",    "hadn't": "had not",        "hasn't": "has not",
    "haven't": "have not",    "he'd've": "he would have",    "he'd": "he would",      "he'll've": "he will have",    "he'll": "he will",        "he's": "he is",    "how'd'y": "how do you",
    "how'd": "how did",      "how'll": "how will",    "how's": "how is",    "i'd've": "i would have",    "i'd": "i would",    "i'll've": "i will have",    "i'll": "i will",    
    "i'm": "i am",    "i've": "i have",    "isn't": "is not",    "it'd've": "it would have",    "it'd": "it would",    "it'll've": "it will have",      "it'll": "it will",      "it's": "it is",
    "let's": "let us",    "ma'am": "madam",    "mayn't": "may not",    "might've": "might have",    "mightn't've": "might not have",    "mightn't": "might not",        "must've": "must have",
    "mustn't've": "must not have",    "mustn't": "must not",    "needn't've": "need not have",      "needn't": "need not",      "oughtn't've": "ought not have",    "oughtn't": "ought not",
    "shan't've": "shall not have",    "shan't": "shall not",    "sha'n't": "shall not",    "she'd've": "she would have",    "she'd": "she would",    "she'll've": "she will have",
    "she'll": "she will",      "she's": "she is",    "should've": "should have",    "shouldn't've": "should not have",    "shouldn't": "should not",      "so've": "so have",
    "so's": "so is",    "that'd've": "that would have",    "that'd": "that would",      "that's": "that is",    "there'd've": "there would have",    "there'd": "there had",  
    "there's": "there is",    "they'd've": "they would have",    "they'd": "they would",    "they'll've": "they will have",    "they'll": "they will",      "they're": "they are",
    "they've": "they have",    "to've": "to have",    "wasn't": "was not",    "we'd've": "we would have",    "we'd": "we would",    "we'll've": "we will have",      "we'll": "we will",  
    "we're": "we are",    "we've": "we have",    "weren't": "were not",    "what'll've": "what will have",    "what'll": "what will",        "what're": "what are",    "what's": "what is",
    "what've": "what have",    "when's": "when is",    "when've": "when have",    "where'd": "where did",    "where's": "where is",    "where've": "where have",    "who'll've": "who will have",
    "who'll": "who will",        "who's": "who is",    "who've": "who have",    "why's": "why is",    "why've": "why have",    "will've": "will have",    "won't've": "will not have",
    "won't": "will not",        "would've": "would have",    "wouldn't've": "would not have",    "wouldn't": "would not",    "y'all'd've": "you all would have",    "y'all'd": "you all would",
    "y'all're": "you all are",    "y'all've": "you all have",    "y'all": "you all",    "y'alls": "you all",      "you'd've": "you would have",    "you'd": "you had",
    "you'll've": "you will have",    "you'll": "you will",      "you're": "you are",    "you've": "you have" }

def decontracted(phrase):
    
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    
    return phrase

def text_cleaner(text, treat_contractions = True, remove_stopwords = True, lemmatize = True):  
    
    # lower chars and treat whitespaces
    text = text.lower()
    text = re.compile('\s+').sub(' ', text) # whitespaces
    text = text.strip() # trailing whitespaces  

    # delete narration part between [ and ]
    text = re.sub("[\[].*?[\]]", "", text)

    # contractions
    if treat_contractions:
        text = decontracted(text)
        text = ' '.join([cList.get(word,word) for word in text.split()])

    # delete special characters and numbers
    text = re.compile('[^a-z]').sub(' ', text) 

    # after numbers
    text = re.sub(' th ', '', text)
    text = re.sub(' st ', '', text)
    text = re.sub(' nd ', '', text)
    text = re.compile('\s+').sub(' ', text) # whitespaces
    text = re.compile('\s+').sub(' ', text) # whitespaces
    text = text.strip() # trailing whitespaces 

    # stopwords
    if remove_stopwords:        
        text = ' '.join([word for word in text.split() if word not in STOP_WORDS + ADDITIONAL_STOP_WORDS])

    # lemmatize   
    if lemmatize:
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    # stopwords after lemmatization
    if remove_stopwords:        
        text = ' '.join([word for word in text.split() if word not in STOP_WORDS + ADDITIONAL_STOP_WORDS])

    # final whitespace treatment
    text = re.compile('\s+').sub(' ', text) # whitespaces
    text = text.strip() # trailing whitespaces     

    return text

In [11]:
%%time

data['line_clean'] = data['line'].apply(lambda x: text_cleaner(x))

Wall time: 17.3 s


In [12]:
data = data[data['line_clean'] != '']

Check word count for more possible stop words

In [13]:
from nltk.util import ngrams
from nltk.collections import Counter

In [14]:
all_text = ' '.join(data['line_clean'])
counter = Counter(ngrams(all_text.split(' '), 1))

print('Unique words in text:', len(counter))

Unique words in text: 23116


In [15]:
# pd.DataFrame.from_dict(counter, orient = 'index')\
#     .reset_index()\
#     .sort_values(0, ascending = False)\
#     .reset_index(drop = True)\
#     .rename(columns = {'index' : 'token', 0 : 'count'}).head(50)

In [16]:
data.to_csv('data/transcript_cleaned.csv', index = False)