# **DATA UNDERSTANDING and PREPROCESSING** <br>

In the preprocessing phase of our notebook, we undertook a comprehensive set of tasks to prepare our text data for classification. The following key steps were implemented:

1. Text Cleaning
2. Feature Enrichment
3. Feature Selection
4. Imbalanced Learning (Random Undersampling)
5. Tokenization and Counting

# Importing libraries

In [None]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
import collections as ct
from sklearn.feature_extraction.text import CountVectorizer
from senticnet.senticnet import SenticNet
from sklearn.preprocessing import LabelEncoder
sn = SenticNet()
le = LabelEncoder()

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from wordfreq import word_frequency

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.simplefilter('ignore')

# First approach to the datasets

open id - text dataset

In [None]:
# Load the (id, text) table; the TSV has no header row, so name the columns here.
tweets_path = 'dataset_raw/figurative_clean.tsv'
alltweet = pd.read_table(tweets_path, header=None)
alltweet.columns = ['id', 'text']
alltweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7775 entries, 0 to 7774
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7775 non-null   int64 
 1   text    7775 non-null   object
dtypes: int64(1), object(1)
memory usage: 121.6+ KB


open id - score dataset

In [None]:
# Load the (id, score) table; the spreadsheet has no header row either.
scores_path = 'dataset_raw/task-11-training-data-integer.xls'
id_tweet = pd.read_excel(scores_path, header=None)
id_tweet.columns = ['id', 'score']
id_tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      8000 non-null   int64
 1   score   8000 non-null   int64
dtypes: int64(2)
memory usage: 125.1 KB


check if there're duplicate ids

In [None]:
# Count how often each id occurs in the score table; a count above one marks a duplicate.
duplicate_t = np.unique(id_tweet['id'], return_counts=True)
id_counts = duplicate_t[1]
indices = np.flatnonzero(id_counts > 1)
indices

array([], dtype=int64)

open id - frames dataset

In [None]:
# Load the (id, frame) table; `frame` holds space-separated frame labels.
frames_path = 'dataset_raw/figurative_cleanONLYFRAMES.tsv'
text_tweet = pd.read_table(frames_path, header=None)
text_tweet.columns = ['id', 'frame']
text_tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7825 entries, 0 to 7824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7825 non-null   int64 
 1   frame   7225 non-null   object
dtypes: int64(1), object(1)
memory usage: 122.4+ KB


merge of the datasets

In [None]:
# Inner-join scores, texts and frames on the tweet id.
df = id_tweet.merge(alltweet, on='id').merge(text_tweet, on='id')
print(df.shape)
df.head()

(8336, 4)


Unnamed: 0,id,score,text,frame
0,472189928340606976,-4,I just love working for 6.5 hours without a br...,Ranked_expectation Experiencer_focus Measure_d...
1,472440774785650688,-4,The happy song does not invoke good feelings. ...,Text Desirability Expertise Social_interaction...
2,473085653454827520,-2,Having to run to the train first thing in the ...,Cause_impact Cause_motion Impact Self_motion V...
3,463445012374499328,-1,@OmniJerBear haha should have had at the end,Process_end
4,463501257110724610,-1,Really excited for these last few days of scho...,Calendric_unit Measure_duration Timespan


In [None]:
# Discard the rows that have no frame annotation and renumber the index.
has_frame = df['frame'].notnull()
df = df.loc[has_frame].reset_index(drop=True)

check if there're duplicate tweets

In [None]:
# Per-id row counts in the merged frame, and the positions of the ids seen more than once.
duplicate_t = np.unique(df['id'], return_counts=True)
indices = np.flatnonzero(duplicate_t[1] > 1)

In [None]:
# Map every duplicated id to the number of rows it occupies.
dupl_inx = {
    tweet_id: n_rows
    for tweet_id, n_rows in zip(*duplicate_t)
    if n_rows > 1
}

In [None]:
# Keep a single row per tweet id.
# The previous nested iterrows() loop dropped every occurrence of a duplicated
# id except the last one, but scanned the whole frame once per duplicated id
# (several minutes). drop_duplicates(keep='last') keeps exactly the same rows,
# with their original index labels, in one vectorised pass, so this cell is
# now cheap to re-run.
df = df.drop_duplicates(subset='id', keep='last')


In [None]:
# Sanity check: after de-duplication the largest per-id row count must be 1.
np.unique(df['id'], return_counts=True)[1].max()

1

## Add frames to the dataset

In [None]:
# Add one 0/1 indicator column per frame label.
# `frame` holds space-separated labels. str.get_dummies splits on the separator
# and matches whole labels only. The previous `str.contains(label)` did regex
# substring matching, so any label contained in another label was also flagged
# on rows that only carry the longer one, and an empty token (from a doubled
# space) produced an all-ones column.
frame_indicators = df['frame'].str.get_dummies(sep=' ').astype(int)
frames = set(frame_indicators.columns)

# As before, an indicator replaces any existing column of the same name.
df = pd.concat(
    [df.drop(columns=frame_indicators.columns, errors='ignore'), frame_indicators],
    axis=1,
)

In [None]:
# Persist the de-duplicated dataset at the path the later cells read it back
# from ('dataset_raw/text_dataset.csv'). Writing it to the working directory
# left those cells pointing at a file this notebook never produced.
df.to_csv('dataset_raw/text_dataset.csv', sep=',', header=True, index=False)

# Text cleaning

In [None]:
# define my punctuation: the characters stripped from tweets
# ('.', '!', '?' and '-' are not in the list, so they survive cleaning)
my_punct = list('"$%&\'()*+,/:;<=>@[\\]^_`{|}~»«“”#')

# character class matching any single listed character
punct_pattern = re.compile("[{}]".format(re.escape("".join(my_punct))))

In [None]:
def clean_text(text):
    """Strip mentions, links and unwanted punctuation from one tweet.

    Parameters
    ----------
    text : str
        Tweet text (``clean`` passes it already lower-cased).

    Returns
    -------
    tuple
        ``(ctext, has_emoticon, has_hashtag)``: the cleaned text, whether an
        emoticon was found after mention/link removal, and whether the
        original text contains a ``#hashtag``.
    """
    # remove @mentions and http(s) links
    ctext = re.sub(r"(?:\@|https?\://)\S+", "", text)
    # Emoticons are matched case-insensitively: the caller lower-cases the
    # tweet first, which made the upper-case alternatives (":D", ":P", "XD")
    # unreachable with a case-sensitive pattern.
    emo = re.findall(r'\w*[:;=][-D3BPcoO\)\(\\]\(*\)*|<3|XD|\^\-\^', ctext, flags=re.IGNORECASE)
    hashtag = re.findall(r'#\w+', text)
    # collapse every run of spaces (not just one pair) to a single space
    ctext = re.sub(r" {2,}", " ", ctext)
    # instagram/images links; dots are escaped so they only match a literal '.'
    ctext = re.sub(r'instagram\.com/\w*/\w*[.-]|pinterest\.com/pin/\d+|pic\.twitter\.com/\w+|https?://\S+|www\.\S+|\w+\.\w+\/\w+', "", ctext)
    # 'useless' punctuation
    ctext = re.sub(punct_pattern, "", ctext)
    # return only the presence/absence of an emoticon/hashtag in the text
    return ctext, len(emo) > 0, len(hashtag) > 0

In [None]:
# Reload the merged dataset from disk so the expensive cells above need not be re-run.
file_tsv = "dataset_raw/text_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_tsv, encoding='utf-8')

# define function to clean the text of a dataframe
# for this task we pass all the dataset and clean the data from
# the column 'text'
def clean(df):
    """Clean the ``text`` column of ``df`` in place and add derived columns.

    Each tweet is lower-cased, passed through ``clean_text`` and stripped of
    English stop words. The frame is modified in place:

    * ``text_list`` - list of remaining tokens,
    * ``text``      - the tokens re-joined with single spaces,
    * ``emoticon``  - 1 if ``clean_text`` found an emoticon, else 0,
    * ``hashtag``   - 1 if ``clean_text`` found a hashtag, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``text``.
    """
    stop_words = set(stopwords.words('english'))
    # clean_text() removes apostrophes before the filter below runs, so a
    # contraction such as "don't" arrives as "dont" and used to slip through.
    # Add the punctuation-free spelling of every stop word as well.
    stop_words |= {punct_pattern.sub('', word) for word in stop_words}

    cleaned_text = []
    text_proc = []
    emoticon = []
    hashtag = []
    for tweet in df['text']:
        ctext, has_emoticon, has_hashtag = clean_text(tweet.lower())
        tokens = [word for word in ctext.split() if word not in stop_words]
        cleaned_text.append(tokens)
        text_proc.append(' '.join(tokens))  # one string per tweet
        emoticon.append(int(has_emoticon))
        hashtag.append(int(has_hashtag))

    df['text_list'] = cleaned_text
    df['text'] = text_proc
    df['emoticon'] = emoticon
    df['hashtag'] = hashtag

# Clean the tweets in place; adds the text_list, emoticon and hashtag columns to df.
clean(df)

# Feature enrichment
<br> Here we add features coming from the paper [Francesco Barbieri and Horacio Saggion, 2014, Modelling Irony in Twitter](https://aclanthology.org/E14-3007.pdf), namely *Syno_Lower_Mean* and *Syn_Mean*. These features quantify the use of synonyms in tweets by calculating the mean frequency of selected synonyms with frequencies lower than the original word (Syno_Lower_Mean) and of all synonyms (Syn_Mean), providing insights into the intentional use of less common words in ironic communication.


In [None]:
# function to compute the features from text of the tweets
# df is passed as parameter to add directly the columns
# returns all the data structures containing polarity, sentics values,....
def add_feature(df):
    """Compute synonym and SenticNet features for every tweet in ``df``.

    Adds the columns ``syno_lower_mean`` (number of WordNet lemmas rarer than
    the word itself, per token) and ``syn_mean`` (number of WordNet synsets,
    per token) to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text_list``, one list of tokens per row.

    Returns
    -------
    tuple
        ``(polarity_label, sentics_values, moodtags, all_moods, semantics)``:

        * ``polarity_label`` - majority SenticNet polarity per tweet
          (``'neutral'`` when no token is known to SenticNet),
        * ``sentics_values`` - dict mapping sentic name to one per-tweet mean per row,
        * ``moodtags`` / ``semantics`` - per-tweet lists of unique tags,
        * ``all_moods`` - set of every mood tag seen.
    """
    syno_lower_mean = []
    syn_mean = []
    polarity_label = []
    sentics_values = {}
    moodtags = []
    semantics = []
    all_moods = set()

    for n_done, tweet in enumerate(df['text_list']):
        # per-tweet accumulators
        sum_syn = 0
        n_synset = 0
        polarity = []
        tweet_values = {}
        mood_data = []
        sem_data = []

        for w in tweet:
            try:
                # retrieve all the sentic data for a word
                senticnet_data = sn.concept(w)
                for k, v in senticnet_data['sentics'].items():
                    tweet_values[k] = tweet_values.get(k, 0) + float(v)
                # strip the leading '#' so the tags can be used as labels
                mood_data.extend(tag.lstrip('#') for tag in senticnet_data['moodtags'])
                sem_data.extend(tag.lstrip('#') for tag in senticnet_data['semantics'])
                polarity.append(senticnet_data['polarity_label'])
            except (KeyError, ValueError, TypeError):
                # Word unknown to SenticNet (or an unusable entry): best effort,
                # skip the sentic part but still compute the synonym features.
                pass

            # features from the paper
            threshold = word_frequency(w, 'en')
            synsets = wordnet.synsets(w, lang='eng')  # looked up once per word
            n_synset += len(synsets)
            # count the synonyms that are rarer than the word itself
            for syn in synsets:
                for lemma in syn.lemmas():
                    if word_frequency(lemma.name(), 'en') < threshold:
                        sum_syn += 1

        all_moods.update(mood_data)

        # polarity of a tweet: majority vote, 'neutral' without information
        if polarity:
            labels, counts = np.unique(polarity, return_counts=True)
            polarity_label.append(labels[np.argmax(counts)])
        else:
            polarity_label.append('neutral')

        # An empty token list (every word was a stop word) must not divide by
        # zero; all sums are 0 in that case, so the means come out as 0.
        n_words = max(len(tweet), 1)

        # Keep every sentic list exactly one entry per tweet: a sentic first
        # seen now is back-filled with 0 for the earlier tweets, and sentics
        # missing from this tweet get 0. Previously tweets processed before
        # the first SenticNet hit added nothing, leaving the lists shorter
        # than the frame.
        for k in tweet_values:
            if k not in sentics_values:
                sentics_values[k] = [0] * n_done
        for k, series in sentics_values.items():
            series.append(tweet_values[k] / n_words if k in tweet_values else 0)

        syno_lower_mean.append(sum_syn / n_words)
        syn_mean.append(n_synset / n_words)
        moodtags.append(list(set(mood_data)))
        semantics.append(list(set(sem_data)))

    # first version of the dataset
    df['syno_lower_mean'] = syno_lower_mean
    df['syn_mean'] = syn_mean
    return polarity_label, sentics_values, moodtags, all_moods, semantics

In [None]:
# Adds syno_lower_mean / syn_mean to df in place; the returned SenticNet data is attached to df in a later section.
polarity_label, sentics_values, moodtags, all_moods, semantics = add_feature(df)

## Here we create different versions of the same dataset <br>
first we simply balance the classes and use the data as they are, label, text, ids, and frames

# Imbalance Learning

In [None]:
# Features: everything except the raw score and the token lists.
X = df.drop(columns=['score', 'text_list'])
# Binary target: 1 when the score is negative, 0 otherwise.
y = df['score'].lt(0).astype(int)

# Undersample both classes down to the size of the smaller one.
minority = y.value_counts().min()
rus = RandomUnderSampler(sampling_strategy={0: minority, 1: minority}, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.33, random_state=42
)

# Min-max scaling of the syn_mean and syno_lower_mean columns

In [None]:
# Min-max scale each synonym feature; the scaler is fitted on the training
# split only and then applied unchanged to the test split.
for col in ('syn_mean', 'syno_lower_mean'):
    scaler = MinMaxScaler()
    c = X_train[col].values.reshape(-1, 1)
    X_train[col] = scaler.fit(c).transform(c)
    X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))

In [None]:
# Targets become (n, 1) arrays; features get a fresh 0..n-1 index so both line up on concat.
y_train = y_train.to_numpy().reshape(-1, 1)
y_test = y_test.to_numpy().reshape(-1, 1)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [None]:
# Put the label first (it ends up as column 0), followed by the feature columns.
df_train, df_test = (
    pd.concat([pd.DataFrame(target), features], axis=1)
    for target, features in ((y_train, X_train), (y_test, X_test))
)

In [None]:
# Version 1 of the first-task dataset: frames + synonym features, no feature selection.
df_train.to_csv('dataset_first_task/traindata_frames_syno_nofeatureselection.csv', index=False)
df_test.to_csv('dataset_first_task/testdata_frames_syno_nofeatureselection.csv', index=False)

## Then we use a version with the 1000 most frequent words after the tokenization

In [None]:
# Bag of words limited to 1000 terms; the vocabulary is learned on the training texts only.
vect = CountVectorizer(stop_words='english', max_features=1000)
train_vect_df = vect.fit_transform(X_train['text'])
test_vect_df = vect.transform(X_test['text'])

In [None]:
# Densify the count matrices and label the columns with the vocabulary terms.
vocab = vect.get_feature_names_out()
train_vect_df = pd.DataFrame(train_vect_df.toarray(), columns=vocab)
test_vect_df = pd.DataFrame(test_vect_df.toarray(), columns=vocab)

In [None]:
# Version 2: append the word-count columns to both splits, then save.
df_train = pd.concat([df_train, train_vect_df], axis=1)
df_test = pd.concat([df_test, test_vect_df], axis=1)

df_train.to_csv('dataset_first_task/traindata_frames_wordvect_syno_nofeatureselection.csv', index=False)
df_test.to_csv('dataset_first_task/testdata_frames_wordvect_syno_nofeatureselection.csv', index=False)

## Then a version adding senticnet features

In [None]:
# Attach the SenticNet outputs of add_feature() to the full dataset.
df['polarities'] = polarity_label
df['polarities'] = le.fit_transform(df['polarities'])  # polarity names -> integer codes
for sentic_name, sentic_series in sentics_values.items():
    df[sentic_name] = sentic_series
df['mood'] = moodtags
df['semantics'] = semantics

# create moodtags columns: one 0/1 indicator per mood tag
for mood_name in all_moods:
    df[mood_name + '_mood'] = [1 if mood_name in tags else 0 for tags in df['mood']]

In [None]:
# create a string with all the semantics for a tweet (each concept followed by a space)
# The accumulator used to be called `all`, which shadowed the builtin for every
# later cell, and a CountVectorizer() was built only to be overwritten on the
# next line; both are gone, the resulting strings and columns are unchanged.
prova_sem = [
    ''.join(concept + ' ' for concept in concepts)
    for concepts in df['semantics']
]
df['semantics'] = prova_sem

# use countvectorizer to create all the semantics columns
vect = CountVectorizer(stop_words='english')
sentic_vect_df = vect.fit_transform(df['semantics'])
sentic_vect_df = pd.DataFrame(sentic_vect_df.toarray(), columns=vect.get_feature_names_out())

# align on a fresh 0..n-1 index before the column-wise concat
df.reset_index(inplace=True, drop=True)
df = pd.concat([df, sentic_vect_df], axis=1)

In [None]:
# drop the list/string helper columns now that they are encoded as indicator columns
df = df.drop(columns=['mood', 'semantics', 'text_list'])

In [None]:
df_2 = df.copy() # copy only for 'safety' purposes, for the second task I use the original

In [None]:
# Same balancing and split as before, now on the frame that carries the SenticNet features.
X = df_2.drop(columns='score')
y = df_2['score'].lt(0).astype(int)  # 1 when the score is negative

minority = y.value_counts().min()
rus = RandomUnderSampler(sampling_strategy={0: minority, 1: minority}, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.33, random_state=42
)

In [None]:
# Targets become (n, 1) arrays; features get a fresh 0..n-1 index so both line up on concat.
y_train = y_train.to_numpy().reshape(-1, 1)
y_test = y_test.to_numpy().reshape(-1, 1)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [None]:
# Put the label first (it ends up as column 0), followed by the feature columns.
train_df, test_df = (
    pd.concat([pd.DataFrame(target), features], axis=1)
    for target, features in ((y_train, X_train), (y_test, X_test))
)

In [None]:
# Min-max scale each synonym feature; fitted on the training split, applied to both.
for col in ('syno_lower_mean', 'syn_mean'):
    scaler = MinMaxScaler()
    c = train_df[col].values.reshape(-1, 1)
    train_df[col] = scaler.fit(c).transform(c)
    test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1))

In [None]:
# Version 3 of the first-task dataset: frames + synonym + SenticNet features.
train_df.to_csv('dataset_first_task/traindata_frames_syno_sentic_nofeatureselection.csv', index = False)
test_df.to_csv('dataset_first_task/testdata_frames_syno_sentic_nofeatureselection.csv', index = False)

In [None]:
# Bag of words limited to 1000 terms; the vocabulary is learned on the training texts only.
vect = CountVectorizer(stop_words='english', max_features=1000)
train_vect_df = vect.fit_transform(X_train['text'])
test_vect_df = vect.transform(X_test['text'])

In [None]:
# Densify the count matrices and label the columns with the vocabulary terms.
vocab = vect.get_feature_names_out()
train_vect_df = pd.DataFrame(train_vect_df.toarray(), columns=vocab)
test_vect_df = pd.DataFrame(test_vect_df.toarray(), columns=vocab)

In [None]:
# drop=True keeps reset_index from turning the old index into an 'index'
# column. Without it a plain row number was concatenated into the datasets as
# if it were a feature (it shows up as 'index' in the feature-selection
# output below).
train_vect_df.reset_index(inplace=True, drop=True)
test_vect_df.reset_index(inplace=True, drop=True)

In [None]:
# Append the word-count columns to both splits.
train_df, test_df = (
    pd.concat([base, words], axis=1)
    for base, words in ((train_df, train_vect_df), (test_df, test_vect_df))
)

In [None]:
# Version 4 of the first-task dataset: frames + word counts + synonym + SenticNet features.
train_df.to_csv('dataset_first_task/traindata_frames_wordvect_syno_sentic_nofeatureselection.csv', index = False)
test_df.to_csv('dataset_first_task/testdata_frames_wordvect_syno_sentic_nofeatureselection.csv', index = False)

# Feature selection

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Column 0 is the label; the three columns after it are skipped as non-features
# (presumably id, text and frame, given the merge order - verify if the layout changes).
y = train_df[0]
X = train_df.drop(columns=[0]).iloc[:, 3:]

In [None]:
# Candidate feature names, plus a fixed seed for the random hyper-parameter draws below.
feature_names = X.columns.to_numpy()

np.random.seed(42)

## random decision trees to check how many times a feature is important in the classification

In [None]:
# Fit 200 decision trees with randomly drawn hyper-parameters and count, for
# every feature, in how many of them its importance exceeds the threshold.
feature_dict = {}
threshold = 0.0001

for _ in range(200):
    # the draw order (criterion, split, leaf, depth) fixes the random stream
    crit = np.random.choice(['gini', 'entropy', 'log_loss'])
    mss = np.random.uniform(1e-2, 1e0)
    msl = np.random.uniform(0.001, 0.2)
    md = np.random.randint(2, 200)

    DT = DecisionTreeClassifier(
        criterion=crit,
        max_depth=md,
        min_samples_leaf=msl,
        min_samples_split=mss,
        random_state=42,
    )
    sfm = SelectFromModel(DT, threshold=threshold).fit(X, y)

    for selected_name in feature_names[sfm.get_support()]:
        feature_dict[selected_name] = feature_dict.get(selected_name, 0) + 1

print(sorted(feature_dict.items(), key=lambda item: item[1], reverse=True))


[('literally', 130), ('syno_lower_mean', 128), ('syn_mean', 88), ('attitude', 74), ('speak', 51), ('index', 38), ('introspection', 25), ('disgust_mood', 19), ('sensitivity', 15), ('hashtag', 12), ('temper', 4), ('polarities', 2), ('fear_mood', 1), ('Questioning', 1), ('Change_direction', 1), ('Mental_property', 1), ('Taking', 1), ('Measure_volume', 1), ('anniversary', 1), ('cool', 1), ('die', 1), ('directorate', 1), ('lust', 1), ('face', 1), ('game', 1), ('jumping', 1), ('Aggregate', 1)]


In [None]:
# Keep the label, every feature selected at least once, and the text.
selected = list(feature_dict.keys())

final_df = pd.concat([y, X[selected], train_df['text']], axis=1)
final_df.to_csv('dataset_first_task/traindata_featureselection.csv', index=False)

# The test split is reloaded from disk, where the label column is named '0'.
test_df = pd.read_csv('dataset_first_task/testdata_frames_wordvect_syno_sentic_nofeatureselection.csv')
final_test = pd.concat([test_df['0'], test_df[selected], test_df['text']], axis=1)
final_test.to_csv('dataset_first_task/testdata_featureselection.csv', index=False)

# Data preprocessing for the second task <br>

In [None]:
# Start the second task from the merged dataset saved earlier (before cleaning and feature enrichment).
df = pd.read_csv('dataset_raw/text_dataset.csv')

In [None]:
# Sarcastic tweets: header-less TSV, columns named here as id, score and frames.
sarcastic = pd.read_table('dataset_raw/sarcasticFRAMES.tsv', header= None)
sarcastic.columns = ['id', 'score', 'frames']

In [None]:
# first standardize the id format: strip the source-file prefix from every id
ids = [
    re.sub('traintweets.tsv:|trialtweets.tsv:', '', raw_id)
    for raw_id in sarcastic['id']
]
sarcastic['id'] = ids

# retrieve the row position in df of each sarcastic tweet that is present there
idxs_sar = []
for tweet_id in sarcastic['id']:
    hits = np.where(df['id'] == int(tweet_id))[0]
    if hits.size > 0:
        idxs_sar.append(hits[0])

In [None]:
# Take an explicit copy before adding a column: assigning to the result of an
# indexing operation is the chained-assignment pattern pandas warns about
# (the warning is only hidden by the global warnings filter).
sar_df = df.iloc[idxs_sar, :].copy()
sar_df['label'] = 0  # set label for the sarcastic tweet as 0

In [None]:
#same for ironic
iro = pd.read_table('dataset_raw/ironicFRAMES.tsv', header=None)
iro.columns = ['id', 'score', 'frames']

# strip the source-file prefix from every id
ids = [
    re.sub('traintweets.tsv:|trialtweets.tsv:', '', raw_id)
    for raw_id in iro['id']
]
iro['id'] = ids

# row position in df of each ironic tweet that is present there
idxs_iro = []
for tweet_id in iro['id']:
    row = np.where(df['id'] == int(tweet_id))[0]
    if len(row) > 0:
        idxs_iro.append(row[0])

# Explicit copy before adding a column, to avoid assigning to the result of an
# indexing operation (the chained-assignment pattern pandas warns about).
iro_df = df.iloc[idxs_iro, :].copy()
iro_df['label'] = 1  # ironic tweets are labelled 1

In [None]:
# Stack sarcastic (0) and ironic (1) tweets; id and score are not needed for this task.
df_second_task = pd.concat([sar_df, iro_df], axis=0).drop(columns=['id', 'score'])
df_second_task.shape

(2991, 723)

In [None]:
# Shuffle the rows reproducibly and make sure the first column is called 'text', as clean() expects.
df_second_task = df_second_task.sample(frac=1, replace=False, random_state=42)
df_second_task.columns = ['text', *df_second_task.columns[1:]]

## Same process for feature engineering

In [None]:
# Both calls modify df_second_task in place; the returned SenticNet data is attached in the "Sentic features" section.
clean(df_second_task)
polarity_label, sentics_values, moodtags, all_moods, semantics = add_feature(df_second_task)

### Frames and features from the paper

In [None]:
# Rename the first column to 'tweet', then split off the label and drop the raw helper columns.
df_second_task.columns = ['tweet', *df_second_task.columns[1:]]
y = df_second_task['label']
X = df_second_task.drop(columns=['label', 'text_list', 'frame'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Label first, then the features; the two pieces share the index produced by the split.
train_df, test_df = (
    pd.concat([target, features], axis=1)
    for target, features in ((y_train, X_train), (y_test, X_test))
)

In [None]:
# Min-max scale each synonym feature; fitted on the training split, applied to both.
for col in ('syno_lower_mean', 'syn_mean'):
    scaler = MinMaxScaler()
    c = train_df[col].values.reshape(-1, 1)
    train_df[col] = scaler.fit(c).transform(c)
    test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1))

In [None]:
# Second-task dataset, version 1: text + frames + synonym features.
train_df.to_csv('dataset_second_task/ST_train_text_frames_syno.csv', index = False)
test_df.to_csv('dataset_second_task/ST_test_text_frames_syno.csv', index = False)

### With countvectorizer

In [None]:
# Rebuild the same split (same seed) as the starting point for the word-count version.
y = df_second_task['label']
X = df_second_task.drop(columns=['label', 'text_list', 'frame'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Label first, then the features; the two pieces share the index produced by the split.
train_df, test_df = (
    pd.concat([target, features], axis=1)
    for target, features in ((y_train, X_train), (y_test, X_test))
)

In [None]:
# Min-max scale each synonym feature; fitted on the training split, applied to both.
for col in ('syno_lower_mean', 'syn_mean'):
    scaler = MinMaxScaler()
    c = train_df[col].values.reshape(-1, 1)
    train_df[col] = scaler.fit(c).transform(c)
    test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1))

In [None]:
# Renumber both splits 0..n-1 so they line up with the word-count frames built next.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Bag of words limited to 1000 terms; the vocabulary is learned on the training tweets only.
vect = CountVectorizer(stop_words='english', max_features=1000)
train_vect_df = vect.fit_transform(train_df['tweet'])
test_vect_df = vect.transform(test_df['tweet'])

In [None]:
# Densify the count matrices and label the columns with the vocabulary terms.
vocab = vect.get_feature_names_out()
train_vect_df = pd.DataFrame(train_vect_df.toarray(), columns=vocab)
test_vect_df = pd.DataFrame(test_vect_df.toarray(), columns=vocab)

In [None]:
# drop=True keeps reset_index from turning the old index into an 'index'
# column; without it a plain row number was concatenated into the saved
# datasets as if it were a feature.
train_vect_df.reset_index(inplace=True, drop=True)
test_vect_df.reset_index(inplace=True, drop=True)

In [None]:
# Append the word-count columns to both splits.
train_df, test_df = (
    pd.concat([base, words], axis=1)
    for base, words in ((train_df, train_vect_df), (test_df, test_vect_df))
)

In [None]:
# Second-task dataset, version 2: text + frames + word counts + synonym features.
train_df.to_csv('dataset_second_task/ST_train_text_frames_wordvect_syno.csv', index = False)
test_df.to_csv('dataset_second_task/ST_test_text_frames_wordvect_syno.csv', index = False)

### Sentic features

In [None]:
# Attach the SenticNet outputs of add_feature() to the second-task dataset.
df_second_task['polarities'] = polarity_label
df_second_task['polarities'] = le.fit_transform(df_second_task['polarities'])  # names -> integer codes
for sentic_name, sentic_series in sentics_values.items():
    df_second_task[sentic_name] = sentic_series
df_second_task['mood'] = moodtags
df_second_task['semantics'] = semantics

In [None]:
# create moodtags columns: one 0/1 indicator per mood tag
for mood_name in all_moods:
    df_second_task[mood_name + '_mood'] = [
        1 if mood_name in tags else 0
        for tags in df_second_task['mood']
    ]

In [None]:
# use countvectorizer to create all the semantics columns
# First build one string per tweet (each concept followed by a space). The
# accumulator used to be called `all`, which shadowed the builtin for every
# later cell, and a CountVectorizer() was built only to be overwritten on the
# next line; both are gone, the resulting strings and columns are unchanged.
prova_sem = [
    ''.join(concept + ' ' for concept in concepts)
    for concepts in df_second_task['semantics']
]
df_second_task['semantics'] = prova_sem

vect = CountVectorizer(stop_words='english')
sentic_vect_df = vect.fit_transform(df_second_task['semantics'])
sentic_vect_df = pd.DataFrame(sentic_vect_df.toarray(), columns=vect.get_feature_names_out())

# the frame was shuffled earlier, so renumber it before the column-wise concat
df_second_task.reset_index(inplace=True, drop=True)
df_second_task = pd.concat([df_second_task, sentic_vect_df], axis=1)

In [None]:
# drop the list/string helper columns now that they are encoded as indicator columns
df_second_task = df_second_task.drop(columns=['mood', 'semantics', 'text_list'])

In [None]:
# Split the SenticNet-enriched frame and put the label in front of the features.
y = df_second_task['label']
X = df_second_task.drop(columns=['label', 'frame'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
train_df, test_df = (
    pd.concat([target, features], axis=1)
    for target, features in ((y_train, X_train), (y_test, X_test))
)

In [None]:
# Min-max scale each synonym feature; fitted on the training split, applied to both.
for col in ('syno_lower_mean', 'syn_mean'):
    scaler = MinMaxScaler()
    c = train_df[col].values.reshape(-1, 1)
    train_df[col] = scaler.fit(c).transform(c)
    test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1))

# renumber both splits 0..n-1 for the concat with the word-count frames
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Bag of words limited to 1000 terms; the vocabulary is learned on the training tweets only.
vect = CountVectorizer(stop_words='english', max_features=1000)
train_vect_df = vect.fit_transform(train_df['tweet'])
test_vect_df = vect.transform(test_df['tweet'])

vocab = vect.get_feature_names_out()
train_vect_df = pd.DataFrame(train_vect_df.toarray(), columns=vocab)
test_vect_df = pd.DataFrame(test_vect_df.toarray(), columns=vocab)

# drop=True keeps reset_index from turning the old index into an 'index'
# column. Without it a plain row number was concatenated into the dataset as
# if it were a feature (it shows up as 'index' in the feature-selection
# output below).
train_vect_df.reset_index(inplace=True, drop=True)
test_vect_df.reset_index(inplace=True, drop=True)

train_df = pd.concat([train_df, train_vect_df], axis=1)
test_df = pd.concat([test_df, test_vect_df], axis=1)

In [None]:
# Second-task dataset, version 3: text + frames + SenticNet + word counts + synonym features.
train_df.to_csv('dataset_second_task/ST_train_text_frames_sentic_wordvect_syno.csv', index = False)
test_df.to_csv('dataset_second_task/ST_test_text_frames_sentic_wordvect_syno.csv', index = False)

### Feature selection

In [None]:
# The first two columns (label and tweet text) are not model inputs.
y = train_df['label']
X = train_df.iloc[:, 2:]

In [None]:
# Fit 200 decision trees with randomly drawn hyper-parameters and count, for
# every feature, in how many of them its importance exceeds the threshold.
# Changes from the previous version: the estimator built before the loop was
# never used (it was overwritten on the first iteration) and is gone, and the
# inner loop no longer reuses the outer loop variable `i`.
np.random.seed(42)

feature_names = np.array(X.columns)
feature_dict = {}
threshold = 0.0001

for _ in range(200):
    # the draw order (criterion, split, leaf, depth) fixes the random stream
    crit = np.random.choice(['gini', 'entropy', 'log_loss'])
    mss = np.random.uniform(1e-2, 1e0)
    msl = np.random.uniform(0.001, 0.2)
    md = np.random.randint(2, 200)

    DT = DecisionTreeClassifier(
        criterion=crit,
        max_depth=md,
        min_samples_leaf=msl,
        min_samples_split=mss,
        random_state=42,
    )
    sfm = SelectFromModel(DT, threshold=threshold).fit(X, y)

    for selected_name in feature_names[np.array(sfm.get_support())]:
        feature_dict[selected_name] = feature_dict.get(selected_name, 0) + 1

print(sorted(feature_dict.items(), key=lambda item: item[1], reverse=True))

[('introspection', 200), ('Calendric_unit', 91), ('syn_mean', 84), ('index', 74), ('fear_mood', 26), ('temper', 23), ('proactive', 19), ('syno_lower_mean', 14), ('Awareness', 7), ('enjoy', 7), ('really', 7), ('eagerness_mood', 7), ('sensitivity', 7), ('hashtag', 4), ('love', 3), ('attitude', 2), ('eager', 2), ('im', 2), ('sadness_mood', 1), ('Change_direction', 1), ('Perception_active', 1), ('Perception_experience', 1), ('Cause_impact', 1), ('Familiarity', 1), ('Aesthetics', 1), ('Stimulus_focus', 1), ('Obviousness', 1), ('Leadership', 1), ('cloud_nine', 1), ('smile', 1), ('thanks', 1), ('thoroughly_enjoy', 1), ('called', 1), ('guy', 1), ('irony', 1), ('People', 1), ('sexuality', 1)]


In [None]:
# Reload both splits and keep the label, every feature selected at least once, and the tweet text.
selected = list(feature_dict.keys())

train_df = pd.read_csv('dataset_second_task/ST_train_text_frames_sentic_wordvect_syno.csv')
test_df = pd.read_csv('dataset_second_task/ST_test_text_frames_sentic_wordvect_syno.csv')

final_df = pd.concat([train_df['label'], train_df[selected], train_df['tweet']], axis=1)
final_test = pd.concat([test_df['label'], test_df[selected], test_df['tweet']], axis=1)

In [None]:
# Second-task dataset after feature selection.
final_df.to_csv('dataset_second_task/ST_traindata_featureselection.csv', index=False)
final_test.to_csv('dataset_second_task/ST_testdata_featureselection.csv', index=False)