## SOCIAL MEDIA SENTIMENTS ANALYSIS DATASET - SOSYAL MEDYA DUYGU ANALİZİ VERİ SETİ

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn

In [2]:
dataFrame=pd.read_csv("https://raw.githubusercontent.com/ilay-dncblk/DeusAICommonRepo/Sena-Aziz/SenaVeAzizProje/sentimentdataset.csv")

In [3]:
dataFrame.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


## Removing columns containing numeric data - Sayısal veri içeren sütunları kaldırma

In [4]:
dataFrame = dataFrame.drop(["Unnamed: 0.1", "Unnamed: 0", "Timestamp", "Retweets", "Likes", "Year", "Month", "Day", "Hour"], axis=1)

In [5]:
dataFrame.head()

Unnamed: 0,Text,Sentiment,User,Platform,Hashtags,Country
0,Enjoying a beautiful day at the park! ...,Positive,User123,Twitter,#Nature #Park,USA
1,Traffic was terrible this morning. ...,Negative,CommuterX,Twitter,#Traffic #Morning,Canada
2,Just finished an amazing workout! 💪 ...,Positive,FitnessFan,Instagram,#Fitness #Workout,USA
3,Excited about the upcoming weekend getaway! ...,Positive,AdventureX,Facebook,#Travel #Adventure,UK
4,Trying out a new recipe for dinner tonight. ...,Neutral,ChefCook,Instagram,#Cooking #Food,Australia


## Writing words in lower case - Kelimeleri küçük harfle yazma

In [6]:
dataFrame["Text"] = dataFrame["Text"].str.lower()

In [7]:
dataFrame["Sentiment"] = dataFrame["Sentiment"].str.lower()

In [8]:
dataFrame["User"] = dataFrame["User"].str.lower()

In [9]:
dataFrame["Platform"] = dataFrame["Platform"].str.lower()

In [10]:
dataFrame["Hashtags"] = dataFrame["Hashtags"].str.lower()

In [11]:
dataFrame["Country"] = dataFrame["Country"].str.lower()

In [12]:
dataFrame.head()

Unnamed: 0,Text,Sentiment,User,Platform,Hashtags,Country
0,enjoying a beautiful day at the park! ...,positive,user123,twitter,#nature #park,usa
1,traffic was terrible this morning. ...,negative,commuterx,twitter,#traffic #morning,canada
2,just finished an amazing workout! 💪 ...,positive,fitnessfan,instagram,#fitness #workout,usa
3,excited about the upcoming weekend getaway! ...,positive,adventurex,facebook,#travel #adventure,uk
4,trying out a new recipe for dinner tonight. ...,neutral,chefcook,instagram,#cooking #food,australia


## Remove punctuation marks - Noktalama işaretlerini kaldırma

In [13]:
import re

In [14]:
dataFrame["Text"] = dataFrame["Text"].str.strip()

In [15]:
punctuation_pattern = re.compile(r"[^\w\s]", re.UNICODE)
dataFrame["Text"] = dataFrame["Text"].apply(lambda x: punctuation_pattern.sub("", x))

In [16]:
punctuation_pattern = re.compile(r"[^\w\s]", re.UNICODE)
dataFrame["Hashtags"] = dataFrame["Hashtags"].apply(lambda x: punctuation_pattern.sub("", x))

In [17]:
dataFrame.head()

Unnamed: 0,Text,Sentiment,User,Platform,Hashtags,Country
0,enjoying a beautiful day at the park,positive,user123,twitter,nature park,usa
1,traffic was terrible this morning,negative,commuterx,twitter,traffic morning,canada
2,just finished an amazing workout,positive,fitnessfan,instagram,fitness workout,usa
3,excited about the upcoming weekend getaway,positive,adventurex,facebook,travel adventure,uk
4,trying out a new recipe for dinner tonight,neutral,chefcook,instagram,cooking food,australia


## Data Augmentation Process - Veri Arttırma İşlemi

In [18]:
import nltk
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import stopwords
from nltk.corpus import wordnet
#nltk.download('averaged_perceptron_tagger_eng')

In [19]:
#cümleleri token haline getirme
kelime=word_tokenize(dataFrame.Text[0])
print(kelime)

['enjoying', 'a', 'beautiful', 'day', 'at', 'the', 'park']


In [20]:
stop_words=list(stopwords.words('english'))
#stopwords = tek başına etkisiz olan kelimelerdir. bunları temel model eğitimlerinde çıkarırız.
stop_words[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [21]:
#gereksiz olan stopwordleri kaldırıyoruz
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

dataFrame['Text'] = dataFrame['Text'].apply(remove_stopwords)
dataFrame

Unnamed: 0,Text,Sentiment,User,Platform,Hashtags,Country
0,enjoying beautiful day park,positive,user123,twitter,nature park,usa
1,traffic terrible morning,negative,commuterx,twitter,traffic morning,canada
2,finished amazing workout,positive,fitnessfan,instagram,fitness workout,usa
3,excited upcoming weekend getaway,positive,adventurex,facebook,travel adventure,uk
4,trying new recipe dinner tonight,neutral,chefcook,instagram,cooking food,australia
...,...,...,...,...,...,...
727,collaborating science project received recogni...,happy,scienceprojectsuccesshighschool,facebook,sciencefairwinner highschoolscience,uk
728,attending surprise birthday party organized fr...,happy,birthdaypartyjoyhighschool,instagram,surprisecelebration highschoolfriendship,usa
729,successfully fundraising school charity initia...,happy,charityfundraisingtriumphhighschool,twitter,communitygiving highschoolphilanthropy,canada
730,participating multicultural festival celebrati...,happy,multiculturalfestivaljoyhighschool,facebook,culturalcelebration highschoolunity,uk


In [22]:
#kelimelerin eş anlamlıları ve halleri 
ornek1 = wordnet.synsets("walk") 
ornek1 #örneğin kelimenin  1. 2. veya 3. numaraları anlamlarını ve noun,verb,adjective vs. olma durumunu gösterir

[Synset('walk.n.01'),
 Synset('base_on_balls.n.01'),
 Synset('walk.n.03'),
 Synset('walk.n.04'),
 Synset('walk.n.05'),
 Synset('walk.n.06'),
 Synset('walk_of_life.n.01'),
 Synset('walk.v.01'),
 Synset('walk.v.02'),
 Synset('walk.v.03'),
 Synset('walk.v.04'),
 Synset('walk.v.05'),
 Synset('walk.v.06'),
 Synset('walk.v.07'),
 Synset('walk.v.08'),
 Synset('walk.v.09'),
 Synset('walk.v.10')]

In [23]:
print(ornek1[0].definition()) #kelimenin tanımı

the act of traveling by foot


In [24]:
ornek1 = wordnet.synset("walk.n.01") 
ornek2 = wordnet.synset("run.n.01")
print("Benzerlik: " + str(ornek1.wup_similarity(ornek2)))
#iki kelime arasındaki benzerliği gösterir örneğin walk ve run kelimlerinin isim hallerinin benzerliği

Benzerlik: 0.5714285714285714


In [25]:
es_anlamli_kelimeler=list() #eş anlamlı kelimeleri bulma
for i in wordnet.synsets("run"):
    es_anlamli_kelimeler.append(i.lemma_names()[0])
print(set(es_anlamli_kelimeler))

{'play', 'ladder', 'footrace', 'operate', 'test', 'carry', 'function', 'hunt', 'scat', 'melt', 'range', 'race', 'tend', 'guide', 'discharge', 'campaign', 'rivulet', 'run', 'prevail', 'political_campaign', 'streak', 'ply', 'move'}


In [26]:
#pos_tag modülü ile kelimelerin türünü gösterir. Örneğin day: NN(isim), beautiful:JJ(sıfat)...
cumle = "enjoying a beautiful day at the park"
kelimeler = nltk.word_tokenize(cumle)
pos_tags = nltk.pos_tag(kelimeler)
for kelimler, tag in pos_tags:
    print(f"{kelimler}: {tag}")

enjoying: VBG
a: DT
beautiful: JJ
day: NN
at: IN
the: DT
park: NN


In [27]:
#üsteki pos türlerini wordnet kütüphanesi türüne çeviriyoruz
def wordnet_pos_cevir(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    else:
        return None

In [28]:
#sadece sıfat olan kelimeleri wordnet verisetindeki eş anlamları ile değiştiriyoruz
def sifatlari_es_anlamlarina_cevir(cumle):
    kelimeler = nltk.word_tokenize(cumle)
    pos_tags = nltk.pos_tag(kelimeler)
    
    yeni_cumle = []
    
    for kelime, tag in pos_tags:
        wordnet_pos = wordnet_pos_cevir(tag)
        #wordnette eş anlamlısı var ise döngüler çalışır ve eş anlamlılarrdan birini ekler
        if wordnet_pos == wordnet.ADJ:
            synsets = wordnet.synsets(kelime, pos=wordnet_pos)
            if synsets:
                synonyms = [lemma.name() for synset in synsets for lemma in synset.lemmas()]
                if synonyms:
                    synonym = random.choice(synonyms)
                    yeni_cumle.append(synonym)
                else:
                    yeni_cumle.append(kelime)
            else:
                yeni_cumle.append(kelime)
        else:
            yeni_cumle.append(kelime)
    
    return ' '.join(yeni_cumle)

cumle = dataFrame.Text[5]
yeni_cumle = sifatlari_es_anlamlarina_cevir(cumle)
print(f"Yeni cümle: {yeni_cumle}")

Yeni cümle: feeling grateful little things life


In [29]:
#örnek olarak sıfatları değişmiş cümleler
for i in range(len(dataFrame.Text)):
    print(sifatlari_es_anlamlarina_cevir(dataFrame.Text[i]))

enjoying beautiful day park
traffic terrible morning
finished amazing workout
excited approaching weekend getaway
trying new recipe dinner tonight
feeling grateful small things life
rainy days call cosy blankets hot cocoa
raw movie release mustwatch
political discussions heating timeline
missing summer vibes beach days
published fresh blog post check
feeling bit weather today
exploring citys obscure gems
new year New fitness goals
technology changing way live
reflecting past looking ahead
adopted cute furry friend
latenight gaming session friends
attending virtual conference ai
winter blues got feeling low-toned
sipping coffee enjoying well book
exploring world virtual reality
rich day ticking todo list
finished challenging workout routine
celebrating milestone work
sunday brunch friends
learning raw language personal growth
hushed evening good book
reflecting importance mental health
new painting progress
weekend road trip explore scenic views
enjoying cup tea watching sunset
coding f

### Değişimler
#### traffic terrible morning --> traffic frightful morning 

#### feeling grateful little things life	--> feeling grateful petty things life

##### ( her yeniden çalıştırmada cümleler değişir )

In [30]:
#es anlamli olan kelimeleri veri setine ekliyoruz bu işlemi tekrar sayısı kadar birden çok kez yapıyoruz ki daha fazla cümle üretilsin
def es_anlamlilari_verisetine_ekle(dataframe, text, sentiment, tekrar):
    for _ in range(tekrar):
        yeni_satirlar = []

        for i in range(len(dataframe)):
            cumle = dataframe.iloc[i][text]
            duygu = dataframe.iloc[i][sentiment]
            yeni_cumle = sifatlari_es_anlamlarina_cevir(cumle)
            
            yeni_satirlar.append({text: yeni_cumle, sentiment: duygu})

        #oluşturdğumuz veri setlerini birleştirme işlemi
        new_df = pd.DataFrame(yeni_satirlar)
        dataframe = pd.concat([dataframe, new_df], ignore_index=True)

        #tekrar eden öğeleri kaldırma işlemi
        dataframe = dataframe.drop_duplicates(subset=text, keep='first')
    
    return dataframe


In [31]:
#text ve sentiment sutünlarını yeni dataframe ekliyoruz
yeni_dataframe = es_anlamlilari_verisetine_ekle(dataFrame, "Text", "Sentiment", tekrar=5)


In [32]:
#ana veri setindeki 732 satırlık yorum 3000+ satıra çıkarılmıştır.
yeni_dataframe[["Text","Sentiment"]]

Unnamed: 0,Text,Sentiment
0,enjoying beautiful day park,positive
1,traffic terrible morning,negative
2,finished amazing workout,positive
3,excited upcoming weekend getaway,positive
4,trying new recipe dinner tonight,neutral
...,...,...
4513,tearaway delight world brainish fairy tales,whimsy
4515,credits roll silent sense nostalgia washes red...,nostalgia
4519,visited art gallery appreciating brushstrokes ...,joy
4523,got bland tire way of_import meeting talk seri...,bad


In [33]:
#yeni veri setini kayıt için
#yeni_dataframe.to_csv('yeni_veri_seti.csv', index=False)


## ÖZET: Veri temizleme işlemi yapıldıktan sonra metin verilerinin modeller tarafından daha iyi öğrenilmesi için veri çoğaltma işlemi yapılır. Bu projede cümlelerin eş anlamlıları eklenerek veri miktarı artırılmıştır. Bu sayede basit düzeydeki modeller daha çok veri ile öğrenme yapabilir.