In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import os
import time
!pip install ipython-autotime
%load_ext autotime
import warnings
warnings.filterwarnings('ignore')
from wordcloud import WordCloud, STOPWORDS
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords_tr = set(stopwords.words('turkish'))
!pip install tweet-preprocessor
import string
import re

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Load the raw tweet export. The columns 'Tweet Id', 'Text' and 'Datetime'
# are required by the steps below.
df = pd.read_csv('twitter_data.csv')

print("başlangıç veri seti satır sayısı", df.shape)

# Keep one row per tweet id and discard rows lacking an id or a text.
df = df.drop_duplicates(subset=['Tweet Id'])
df = df.dropna(axis=0, how='any', subset=['Text', 'Tweet Id'])

# Strip the "+00:00" offset suffix. regex=False makes this a literal
# replacement on every pandas version; the previous '\+00:00' pattern only
# matched when str.replace treated its pattern as a regular expression.
df['Datetime'] = df['Datetime'].str.replace('+00:00', '', regex=False)

# Remove @mentions, #hashtags and http(s) links (each up to the next
# whitespace) and trim the surrounding whitespace.
df['clean_text'] = df['Text'].apply(lambda x: re.sub(r"(?:\@|#|https?\://)\S+", "", x).strip())

# Tweets that consisted only of mentions/hashtags/links are now empty strings:
# turn empty strings into NaN and drop rows whose clean_text is missing.
df = df.replace('', np.nan, regex=True)
df = df.dropna(axis=0, how='all', subset=['clean_text'])

def remove_punc(test_str):
  """Return ``test_str`` with every character listed in ``punc`` removed.

  Args:
    test_str: String to clean.

  Returns:
    A new string without the punctuation characters; all other characters
    (letters, digits, whitespace, and symbols not in ``punc`` such as '+'
    or '=') are kept in their original order.
  """
  # Raw literal: the set deliberately contains a backslash, and a raw string
  # spells it without relying on the invalid "\," escape sequence.
  punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  # A single translate pass deletes all listed characters at once.
  return test_str.translate(str.maketrans('', '', punc))


# Strip punctuation from every cleaned tweet.
df['clean_text'] = df['clean_text'].apply(remove_punc)

def remove_stopwords(text):
  """Drop Turkish stop words from ``text``.

  The text is split with NLTK's ``word_tokenize`` and every token found in
  the module-level ``stopWords_tr`` set is discarded. Tokens are compared
  as-is, so the match is case-sensitive.

  Args:
    text: String to filter.

  Returns:
    The remaining tokens joined by single spaces.
  """
  kept_tokens = [w for w in word_tokenize(text) if w not in stopWords_tr]
  return ' '.join(kept_tokens)


# Remove Turkish stop words from each tweet.
df['clean_text'] = df['clean_text'].apply(remove_stopwords)

# Keep only the words that are longer than four characters.
df['clean_text'] = df['clean_text'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 4)
)

In [None]:
# (rows, columns) of the frame after the cleaning steps.
df.shape

In [None]:
# Column summary of the cleaned frame.
df.info()

In [None]:
# Per-hashtag non-null counts of every other column; 'hashtag' stays a
# regular column because of as_index=False.
df_grouped_by_tag = (
    df
    .groupby(by=["hashtag"], as_index=False)
    .count()
)
df_grouped_by_tag

In [None]:
# Bar chart: number of tweets collected for each hashtag.
plt.figure(figsize=(12, 8))
plt.bar(
    x=df_grouped_by_tag['hashtag'],
    height=df_grouped_by_tag['Tweet Id'],
)
plt.title('Kullanılan Etiket sayısı')
plt.ylabel('Sayı')
plt.xlabel('Etiket')
# Draw the x tick labels vertically.
plt.xticks(rotation='vertical')

In [None]:
# Index the per-hashtag counts by the hashtag values (the column itself is
# kept), then draw the 'clean_text' counts as a pie chart.
df_grouped_by_tag = df_grouped_by_tag.set_index(df_grouped_by_tag['hashtag'])

plott = df_grouped_by_tag.plot.pie(
    y='clean_text',
    figsize=(10, 10),
)

plott.legend(loc='best')

In [None]:

import matplotlib.ticker as ticker
# Convert the 'Datetime' column to pandas datetime values.
df['Datetime'] = pd.to_datetime(df['Datetime'])


In [None]:
# Build one corpus string from all cleaned tweets. The tweets are joined
# explicitly because str() of a numpy array is its printed repr (brackets,
# quotes and, for large arrays, an elided "..." middle), not the full text.
all_text = df['clean_text'].values
corpus = ' '.join(str(tweet) for tweet in all_text)

wc2 = WordCloud(width = 3000, height = 2000, random_state=1, background_color='salmon', colormap='viridis', collocations=False, stopwords = stopWords_tr).generate(text=corpus)

# Show the rendered cloud without axes.
plt.figure(figsize=(12, 8))
plt.imshow(wc2)
plt.axis("off");

In [None]:
# Entity patterns for the raw tweet text, written as raw strings so the regex
# escapes are not interpreted as Python string escapes. (?<!\S) accepts a
# mention/hashtag at the very start of the text as well as after whitespace.
MENTION_RE = re.compile(r'(?<!\S)@[\w-]+')
HASHTAG_RE = re.compile(r'(?<!\S)#[\w-]+')
URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# Lists of the entities found in each tweet.
df['mentions'] = df['Text'].apply(lambda x: MENTION_RE.findall(str(x)))
df['hashtags'] = df['Text'].apply(lambda x: HASHTAG_RE.findall(str(x)))
df['urls'] = df['Text'].apply(lambda x: URL_RE.findall(str(x)))

# Entity counts are the lengths of the lists extracted above.
df['mention_count'] = df['mentions'].str.len()
df['hashtag_count'] = df['hashtags'].str.len()
df['url_count'] = df['urls'].str.len()

# Word-level statistics on the raw and the cleaned text.
df['dirty_text_word_count'] = df['Text'].apply(lambda x: len(str(x).split()))
df['clean_text_word_count'] = df['clean_text'].apply(lambda x: len(str(x).split()))
df['unique_word_count'] = df['clean_text'].apply(lambda x: len(set(str(x).split())))
# NOTE(review): stop words were already removed from clean_text during
# cleaning, so this count is probably close to zero -- confirm whether 'Text'
# was the intended input.
df['stop_word_count'] = df['clean_text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stopWords_tr]))
# An empty clean_text yields NaN here (mean of an empty list).
df['mean_word_length'] = df['clean_text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# Character-level statistics.
df['dirty_text_char_count'] = df['Text'].apply(lambda x: len(str(x)))
df['clean_text_char_count'] = df['clean_text'].apply(lambda x: len(str(x)))
# NOTE(review): punctuation was already stripped from clean_text during
# cleaning -- confirm whether 'Text' was the intended input.
df['punctuation_count'] = df['clean_text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# Number of words removed by the cleaning steps.
df['difference_in_words'] = df['dirty_text_word_count'] - df['clean_text_word_count']

In [None]:
from collections import defaultdict

def generate_ngrams(text, n_gram=1):
    """Return the word n-grams of ``text`` as space-joined strings.

    The text is lower-cased and split on single spaces; empty pieces and
    tokens present in the module-level ``stopWords_tr`` set are skipped
    before the n-grams are formed.

    Args:
        text: Input string.
        n_gram: Number of consecutive tokens per n-gram (default 1).

    Returns:
        List of n-grams in order of appearance.
    """
    tokens = []
    for piece in text.lower().split(' '):
        if piece != '' and piece not in stopWords_tr:
            tokens.append(piece)
    # Zipping the token list against its own shifted copies yields every run
    # of n_gram consecutive tokens.
    shifted = [tokens[offset:] for offset in range(n_gram)]
    return [' '.join(group) for group in zip(*shifted)]

# Number of top n-grams shown in the bar charts.
N = 10


def count_ngrams(texts, n_gram):
    """Count the n-grams occurring in ``texts``.

    Args:
        texts: Iterable of strings, each passed to ``generate_ngrams``.
        n_gram: Size of the n-grams to count.

    Returns:
        A ``(counts, frame)`` tuple: ``counts`` maps each n-gram to its number
        of occurrences, and ``frame`` is a DataFrame with the n-gram in
        column 0 and its count in column 1, most frequent first.
    """
    counts = defaultdict(int)
    for tweet in texts:
        for ngram in generate_ngrams(tweet, n_gram=n_gram):
            counts[ngram] += 1
    # Ascending sort followed by a reversal puts the highest counts first.
    frame = pd.DataFrame(sorted(counts.items(), key=lambda x: x[1])[::-1])
    return counts, frame


# Unigrams
unigrams, df_unigrams = count_ngrams(df['clean_text'], 1)

# Bigrams
bigrams, df_bigrams = count_ngrams(df['clean_text'], 2)

# Trigrams
trigrams, df_trigrams = count_ngrams(df['clean_text'], 3)

In [None]:
import seaborn as sns

# Horizontal bar chart of the N most frequent unigrams
# (column 0: n-gram, column 1: count).
fig, axes = plt.subplots()

sns.barplot(y=df_unigrams[0].values[:N], x=df_unigrams[1].values[:N], color='red', ax=axes)

axes.set_xlabel('')
axes.set_ylabel('')
axes.tick_params(axis='x', labelsize=15)
axes.tick_params(axis='y', labelsize=15)

# Compute the layout after drawing so the tick labels are taken into account.
plt.tight_layout()
plt.show()

In [None]:
# Unigram frequency table (column 0: unigram, column 1: count), most frequent first.
df_unigrams

In [None]:
# Horizontal bar chart of the N most frequent bigrams
# (column 0: n-gram, column 1: count).
fig, axes = plt.subplots()

sns.barplot(y=df_bigrams[0].values[:N], x=df_bigrams[1].values[:N], color='red', ax=axes)

axes.set_xlabel('')
axes.set_ylabel('')
axes.tick_params(axis='x', labelsize=15)
axes.tick_params(axis='y', labelsize=15)

# Compute the layout after drawing so the tick labels are taken into account.
plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the N most frequent trigrams
# (column 0: n-gram, column 1: count).
fig, axes = plt.subplots()

sns.barplot(y=df_trigrams[0].values[:N], x=df_trigrams[1].values[:N], color='red', ax=axes)

axes.set_xlabel('')
axes.set_ylabel('')
axes.tick_params(axis='x', labelsize=15)
axes.tick_params(axis='y', labelsize=15)

# Compute the layout after drawing so the tick labels are taken into account.
plt.tight_layout()
plt.show()

In [None]:
# Per-user non-null counts of every other column; 'Username' stays a regular
# column because of as_index=False.
df_groupby_username = (
    df
    .groupby(by=["Username"], as_index=False)
    .count()
)

In [None]:
# The ten users with the most tweets, drawn as a horizontal bar chart.
df_groupby_username2 = df_groupby_username.sort_values(by='Tweet Id', ascending=False).head(10)

fig, axes = plt.subplots()

sns.barplot(y=df_groupby_username2['Username'], x=df_groupby_username2['Tweet Id'], ax=axes)

axes.set_xlabel('')
axes.set_ylabel('')
axes.tick_params(axis='x', labelsize=15)
axes.tick_params(axis='y', labelsize=15)

# Compute the layout after drawing so the user names are taken into account.
plt.tight_layout()

In [None]:
import matplotlib 
# Enlarge the tick labels for this and all following figures.
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=20)

df['Datetime'] = pd.to_datetime(df['Datetime'])
df['month'] = df['Datetime'].dt.month

# Tweets per calendar month; groupby returns the months in ascending order.
df_groupby_month = df.groupby(by=["month"], as_index=False).count()

# Derive the wedge labels from the months actually present, so the label list
# always has the same length and order as the plotted values.
month_names_tr = {
    1: 'Ocak', 2: 'Şubat', 3: 'Mart', 4: 'Nisan', 5: 'Mayıs', 6: 'Haziran',
    7: 'Temmuz', 8: 'Ağustos', 9: 'Eylül', 10: 'Ekim', 11: 'Kasım', 12: 'Aralık',
}
labels = [month_names_tr[month] for month in df_groupby_month['month']]

pie, ax = plt.subplots(figsize=[20,15])
ax.pie(x=df_groupby_month['Tweet Id'], autopct="%.1f%%",  labels=labels, pctdistance=0.5, textprops={'fontsize': 20})

In [None]:
df['Datetime'] = pd.to_datetime(df['Datetime'])
df['day'] = df['Datetime'].dt.day

# Tweets per day of month, busiest day first.
# NOTE(review): grouping on the day number alone merges the same day number
# from different months -- confirm this is intended.
df_groupby_day = (
    df
    .groupby(by=["day"], as_index=False)
    .count()
    .sort_values(by='Tweet Id', ascending=False)
)

days = df_groupby_day['day']

fig, axes = plt.subplots()

# x and height come from the same frame, so they are aligned row by row.
axes.bar(x=days, height=df_groupby_day['Tweet Id'])

axes.set_xlabel('')
axes.set_ylabel('')
axes.tick_params(axis='x', labelsize=15)
axes.tick_params(axis='y', labelsize=15)

# Compute the layout after drawing so the tick labels are taken into account.
plt.tight_layout()

In [None]:
# Extract the hour of each tweet from its timestamp.
df['Datetime'] = pd.to_datetime(df['Datetime'])
df['hour'] = df['Datetime'].dt.hour

# Per-hour non-null counts of every other column.
df_groupby_hour = (
    df
    .groupby(by=["hour"], as_index=False)
    .count()
)

fig, axes = plt.subplots()
plt.tight_layout()

# One bar per hour; the hour values serve both as tick positions and as
# tick labels.
labels = df_groupby_hour['hour']
plt.bar(labels, df_groupby_hour['Tweet Id'])
plt.xticks(labels, labels, rotation='vertical')

# Leave room at the bottom for the vertical tick labels.
plt.subplots_adjust(bottom=0.15)
plt.show()

In [None]:
# Preview the first rows, including the columns added so far.
df.head()

In [None]:
# Alternative hashtag count: a '#' followed by word characters and preceded by
# one whitespace or non-word character (so a hashtag at the very start of the
# text is not counted). Raw string keeps the regex escapes intact.
df['hashtag_count2'] = df['Text'].apply(lambda x: len(re.findall(r'[\s\W]#(\w+)', str(x))))


In [None]:
# Rows for which the alternative hashtag counter found no hashtag.
df[df['hashtag_count2'] == 0]

In [None]:
# Number of tweets for each hashtag_count value (non-null counts per column).
df_grouped_by_hashtag_count = (
    df
    .groupby(by=["hashtag_count"], as_index=False)
    .count()
)
df_grouped_by_hashtag_count

In [None]:
# Number of tweets for each url_count value, indexed by that value (the
# column itself is kept as well).
df_grouped_by_url_count = df.groupby(by=["url_count"], as_index=False).count()
df_grouped_by_url_count = df_grouped_by_url_count.set_index(df_grouped_by_url_count['url_count'])

# plot.pie creates its own figure sized by `figsize`, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
plott = df_grouped_by_url_count.plot.pie(y='clean_text', autopct='%1.1f%%',figsize=(10, 10),  pctdistance=0.5, textprops={'fontsize': 20})

In [None]:
# Number of tweets for each mention_count value, indexed by that value (the
# column itself is kept as well).
df_grouped_by_mention_count = df.groupby(by=["mention_count"], as_index=False).count()
df_grouped_by_mention_count = df_grouped_by_mention_count.set_index(df_grouped_by_mention_count['mention_count'])

# plot.pie creates its own figure sized by `figsize`, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
plott = df_grouped_by_mention_count.plot.pie(y='clean_text', figsize=(10, 10))

In [None]:
# Preview the first rows again.
df.head()

In [None]:
import seaborn as sns


def listToString(s):
   """Return the @mentions found in ``s`` as one space-separated string.

   A mention is an '@' followed by word characters or hyphens that stands at
   the start of the text or directly after whitespace, so an '@' inside a
   token (e.g. an e-mail address) is ignored.

   Args:
      s: Text to scan.

   Returns:
      The mentions in order of appearance joined by single spaces, or an
      empty string when there are none.
   """
   # (?<!\S): the '@' must not be preceded by a non-whitespace character,
   # which also accepts a mention at the very beginning of the text.
   return ' '.join(re.findall(r'(?<!\S)@[\w-]+', s))

    
# Space-joined mentions of each tweet.
gg = df['Text'].apply(listToString)

# Flatten the mentions of all tweets into one list of handles and count how
# often each handle occurs.
hh = pd.Series(
    ' '.join(gg).split()
).value_counts()


In [None]:
hh = pd.DataFrame(hh)

# Take up to ten of the most mentioned handles; head() simply returns fewer
# rows when fewer than ten distinct handles exist.
top_mentions = hh.head(10)
mention_values = top_mentions.iloc[:, 0].tolist()

sns.barplot(y=top_mentions.index, x=mention_values)

In [None]:
# Column summary of the frame, now including the derived feature columns.
df.info()