# Data preprocessing

The textual data is processed to extract Part-of-Speech (POS) tags, which are then used as features in an age prediction model. The POS features help the model capture linguistic patterns that are indicative of the age of the author.

In [58]:
# Import required libraries: emoji for emoji lookup, pandas for data manipulation, SpaCy for NLP
import emoji
import pandas as pd
import spacy

# Load the small English language model from SpaCy.
# `sp` is the pipeline object that later cells call on text to obtain Doc objects.
sp = spacy.load('en_core_web_sm')

In [3]:
# Run the pipeline on a short two-sentence example to obtain a SpaCy Doc
sen = sp("I like to play football. I hated it in my childhood though")

# Show the text held by the Doc
print(sen.text)

I like to play football. I hated it in my childhood though


In [5]:
# Print the coarse-grained POS tag of the token at index 7 (the 8th token, 'hated')
print(sen[7].pos_)

# Print the fine-grained POS tag of the same token
print(sen[7].tag_)

VERB
VBD


In [6]:
# Ask SpaCy for a human-readable description of the 8th token's fine-grained tag (VBD)
print(spacy.explain(sen[7].tag_))

verb, past tense


In [13]:
# Print one row per token in left-aligned, fixed-width columns:
# text (min. 12 characters), coarse-grained POS tag (min. 10), fine-grained POS tag (min. 8),
# followed by SpaCy's description of the fine-grained tag
for token in sen:
    description = spacy.explain(token.tag_)
    print(f'{token.text:12} {token.pos_:10} {token.tag_:8} {description}')


I            PRON       PRP      pronoun, personal
like         VERB       VBP      verb, non-3rd person singular present
to           PART       TO       infinitival "to"
play         VERB       VB       verb, base form
football     NOUN       NN       noun, singular or mass
.            PUNCT      .        punctuation mark, sentence closer
I            PRON       PRP      pronoun, personal
hated        VERB       VBD      verb, past tense
it           PRON       PRP      pronoun, personal
in           ADP        IN       conjunction, subordinating or preposition
my           PRON       PRP$     pronoun, possessive
childhood    NOUN       NN       noun, singular or mass
though       ADV        RB       adverb


In [9]:
# Load the comments spreadsheet into a DataFrame; later cells read its 'Comment' column
df = pd.read_excel('../GranTurismoFolgore.xlsx')

In [53]:
def emoji2concat_description(text):
    """Move emoji out of ``text`` and append their English names as plain words.

    Every emoji found in ``text`` is removed and the remaining text is stripped
    of leading/trailing whitespace. The English name of each emoji, with the
    surrounding colons dropped and underscores turned into spaces
    (``:red_heart:`` -> ``red heart``), is then appended in order of
    appearance, each preceded by a single space.

    Args:
        text: The comment to process. Missing values (``None``/``NaN``, as
            pandas produces for empty spreadsheet cells) are treated as an
            empty comment; any other non-string value is converted with
            ``str`` first.

    Returns:
        str: The emoji-free text followed by the emoji descriptions.
    """
    # Empty spreadsheet cells arrive as NaN, which the emoji library cannot scan.
    if pd.isna(text):
        return ''
    text = str(text)

    # First element is the text without emoji; descriptions are appended after it.
    parts = [emoji.replace_emoji(text, replace='').strip()]
    for match in emoji.emoji_list(text):
        # The 'en' entry looks like ':red_heart:'.
        english_name = emoji.EMOJI_DATA[match['emoji']]['en']
        parts.append(' '.join(english_name.split('_')).strip(':'))
    return ' '.join(parts)

In [54]:
# Create a new column 'EmoticonComment' by applying the emoji2concat_description function to each 'Comment'
df['EmoticonComment'] = df['Comment'].apply(emoji2concat_description)

In [55]:
# Run the SpaCy pipeline on every value of the 'EmoticonComment' column.
# NOTE: the result overwrites the 'Comment' column, so after this cell 'Comment' holds
# the SpaCy pipeline output rather than the text loaded from the spreadsheet; re-running the
# previous cell afterwards would feed that output to emoji2concat_description.
df['Comment'] = df['EmoticonComment'].apply(sp)

In [73]:
pos = []
number = 10  # Prefix written at the start of every line; it is not changed inside the loop
# NOTE(review): this was originally labelled a "starting number" - confirm whether it
# was meant to increase per comment. The constant value is kept here.

for doc in df['Comment']:
    # Render each token as text/fine-grained POS tag/coarse-grained POS tag
    tagged_tokens = [f'{token.text}/{token.tag_}/{token.pos_}' for token in doc]

    # Join the prefix and all tokens in one step: a Doc without tokens then yields just
    # the prefix instead of raising IndexError, and no trailing space is left behind
    # when a Doc has a single token
    pos.append(' '.join([str(number), *tagged_tokens]))

# Print or use the 'pos' list as needed
for comment in pos:
    print(comment)


10 There/EX/PRON are/VBP/VERB speed/NN/NOUN limits/NNS/NOUN on/IN/ADP Italian/JJ/ADJ roads/NNS/NOUN ,/,/PUNCT but/CC/CCONJ no/DT/DET one/NN/NOUN knows/VBZ/VERB what/WP/PRON it/PRP/PRON is/VBZ/AUX -/:/PUNCT Clarkson/NNP/PROPN ././PUNCT Really/RB/ADV loved/VBD/VERB the/DT/DET visuals/NNS/NOUN and/CC/CCONJ the/DT/DET car/NN/NOUN ././PUNCT
10 we/PRP/PRON honestly/RB/ADV need/VBP/VERB to/TO/PART see/VB/VERB these/DT/PRON in/IN/ADP india/NNP/PROPN ././PUNCT under/IN/ADP the/DT/DET whole/JJ/ADJ ev/RB/ADV -/HYPH/PUNCT only/RB/ADV moving/VBG/VERB india/NNP/PROPN ,/,/PUNCT i/PRP/PRON think/VBP/VERB the/DT/DET folgore/NN/NOUN would/MD/AUX be/VB/AUX a/DT/DET hit/NN/NOUN 
/_SP/SPACE but/CC/CCONJ again/RB/ADV we/PRP/PRON ca/MD/AUX n't/RB/PART forget/VB/VERB the/DT/DET magic/NN/NOUN 165/CD/NUM %/NN/NOUN can/MD/AUX we/PRP/PRON ?/./PUNCT
10 That/DT/PRON sucks/VBZ/VERB !/./PUNCT !/./PUNCT
10 165/CD/NUM what/WP/PRON does/VBZ/AUX it/PRP/PRON mean/VB/VERB ?/./PUNCT
10 Could/MD/AUX really/RB/ADV feel/VB/VER

In [74]:
# Store the tagged strings from the 'pos' list as a new 'PosComment' column in the DataFrame
df['PosComment'] = pos
# Display the new column
df['PosComment']

0       10 There/EX/PRON are/VBP/VERB speed/NN/NOUN li...
1       10 we/PRP/PRON honestly/RB/ADV need/VBP/VERB t...
2       10 That/DT/PRON sucks/VBZ/VERB !/./PUNCT !/./P...
3       10 165/CD/NUM what/WP/PRON does/VBZ/AUX it/PRP...
4       10 Could/MD/AUX really/RB/ADV feel/VB/VERB you...
                              ...                        
1682    10 Unmistakably/NNP/PROPN Ford/NNP/PROPN Puma/...
1683    10 Right/UH/INTJ ?/./PUNCT The/DT/PRON smaller...
1684    10 Front/NN/NOUN  /_SP/SPACE =/VBZ/VERB Maruti...
1685    10 It/PRP/PRON resembles/VBZ/VERB a/DT/DET For...
1686    10 Oh/UH/INTJ look/VB/VERB ,/,/PUNCT an/DT/DET...
Name: PosComment, Length: 1687, dtype: object

In [75]:
# Save the whole DataFrame, including the added 'EmoticonComment' and 'PosComment' columns,
# to an Excel file in the current working directory
df.to_excel("AgeProcessedData.xlsx")

In [82]:
# Take the 'PosComment' column and replace embedded newline characters with spaces
pos_comments = df['PosComment'].str.replace('\n', ' ')

# Write the result to a text file with tab as the separator and without the row index.
# NOTE(review): with these arguments to_csv also writes the column name as a first header
# line and applies its default CSV quoting - confirm the consumer of this file expects that.
pos_comments.to_csv("PosComments.txt", sep='\t', index=False)