In [1]:
import pandas as pd
import string
import geonamescache
import re
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:

tw_mitaka_a = pd.read_csv("tw_mitaka_907.csv")
tw_mitaka_1 = pd.read_csv("tw_mitaka1.csv")
x = pd.concat([tw_mitaka_a, tw_mitaka_1]).drop_duplicates()
x.to_csv('tw_mitaka1.csv', index=False)


In [3]:
tw_mitaka = pd.read_csv("tw_mitaka1.csv")
tw_mitaka['created_at'] =  pd.to_datetime(tw_mitaka['created_at'])

In [4]:
tw_mitaka.shape

(1963, 7)

In [5]:
tw_mitaka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1963 entries, 0 to 1962
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   tweet_id    1963 non-null   int64              
 1   username    1963 non-null   object             
 2   text        1963 non-null   object             
 3   created_at  1963 non-null   datetime64[ns, UTC]
 4   location    1268 non-null   object             
 5   source      1963 non-null   object             
 6   url         1963 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 107.5+ KB


In [6]:
tw_mitaka.head()

Unnamed: 0,tweet_id,username,text,created_at,location,source,url
0,1562652547726606336,retromanjapan,The Showa Retro In #MITAKA #Tokyo #26: To The ...,2022-08-25 04:06:14+00:00,"Tokyo, Japan",Revive Social App,https://twitter.com/retromanjapan/status/15626...
1,1562619643272892416,soubusen7,The next station is MITAKA,2022-08-25 01:55:29+00:00,千葉～三鷹,twittbot.net,https://twitter.com/soubusen7/status/156261964...
2,1562596747305558020,GBxredom,Hmm some Asa Mitaka for now https://t.co/btf6V...,2022-08-25 00:24:30+00:00,,Twitter for iPad,https://twitter.com/GBxredom/status/1562596747...
3,1562592648229113858,retromanjapan,The Showa Retro In #MITAKA #Tokyo #6: A Row Ho...,2022-08-25 00:08:13+00:00,"Tokyo, Japan",Revive Social App,https://twitter.com/retromanjapan/status/15625...
4,1562567606762029058,alexisashin,what really happened to asa mitaka https://t.c...,2022-08-24 22:28:43+00:00,,Twitter for iPhone,https://twitter.com/alexisashin/status/1562567...


In [7]:
#Stop words, links, emotes, non-ascii characters are removed.
stops = set(stopwords.words('english'))

for i in range(len(tw_mitaka['text'])):
    tweet_text = []
    #https://stackoverflow.com/a/24399874
    tw_mitaka['text'][i] = re.sub(r'[^\x00-\x7f]',r'', tw_mitaka['text'][i])
    #https://stackoverflow.com/a/65330690
    tw_mitaka['text'][i] = re.sub(r"http\S+", "", tw_mitaka['text'][i])
    
    words = tw_mitaka['text'][i].lower().split()
    words = [''.join(letter for letter in word if letter not in (string.punctuation)) for word in words]
    for word in words:
        if word not in stops:
            tweet_text.append(word)

    tw_mitaka['text'].iloc[i] = ' '.join(tweet_text)

In [33]:
print(tw_mitaka['text'][1000])

drew another war devil illustration celebrate latest trailer wanna draw power next   chainsawman 


In [9]:
#Word frequency dict
word_freq = {}

for text_list in tw_mitaka['text']:
    for word in text_list.split():
        if word not in word_freq:
            word_freq[word] = 0
        word_freq[word] += 1

In [10]:
dict(list(word_freq.items())[0:5])

{'showa': 129, 'retro': 119, 'mitaka': 931, 'tokyo': 113, '26': 3}

In [11]:
#Creating dataframe from word freq dict
df_wordfreq = pd.DataFrame.from_dict(word_freq, orient='index', columns=['Freq'])

In [35]:
df_wordfreq.sort_values('Freq', ascending=False).head(5)

Unnamed: 0,Freq
devil,943
war,936
mitaka,931
chainsawman,502
asa,404


#### VADER Sentimental Analysis

In [13]:
#Sentimental analysis function with VADER
def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
    sentiment_dict = sid_obj.polarity_scores(sentence)
    if sentiment_dict['compound'] >= 0.05 :
        return("Positive")
    elif sentiment_dict['compound'] <= -0.05 :
        return("Negative")
    else :
        return("Neutral")

In [14]:
#Adding score column for sentiment analysis
tw_mitaka_vader = tw_mitaka.copy()

In [15]:
#VADER Sentimental analysis on text. 
score_list = []
for i in tw_mitaka_vader['text']:
    score = sentiment_scores(i)
    score_list.append(score)

In [16]:
tw_mitaka_vader['score'] = score_list

In [17]:
tw_mitaka_vader.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1963 entries, 0 to 1962
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   tweet_id    1963 non-null   int64              
 1   username    1963 non-null   object             
 2   text        1963 non-null   object             
 3   created_at  1963 non-null   datetime64[ns, UTC]
 4   location    1268 non-null   object             
 5   source      1963 non-null   object             
 6   url         1963 non-null   object             
 7   score       1963 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(1), object(6)
memory usage: 122.8+ KB


### Can see that most tweets are "negative" according to the algorithm, but I want to say that's mostly due to the language of the queried keywords in the first place.

In [40]:
print(tw_mitaka_vader.groupby('score').size())

score
Negative    1091
Neutral      487
Positive     385
dtype: int64


In [19]:
print(tw_mitaka.groupby('username').size().sort_values(ascending=False)[0:10])

username
retromanjapan     102
soubusen7          28
loveabyssal        27
Rod_anderis        21
JRE_Chuo_E         16
JRE_Sobu_E         15
AirRapid_HE433     12
J1_Travel          12
lonehuman123       11
dnbooru            11
dtype: int64


In [20]:
print(tw_mitaka.groupby('location').size().sort_values(ascending=False)[0:10])

location
Tokyo, Japan                      105
千葉～三鷹                              28
ic: mazamuno → qt count: 49        27
she/her                            18
Tokyo                              18
Tokyo-to, Japan                    14
❮SY13❯舞子公園(兵庫県 神戸市)                12
dying non human innocent Earth     11
aroace - 23 - rey                   9
Malaysia                            8
dtype: int64


In [21]:
print(tw_mitaka.groupby('source').size().sort_values(ascending=False)[0:5])

source
Twitter for Android    601
Twitter for iPhone     573
Twitter Web App        459
Revive Social App      102
Twitter for iPad        54
dtype: int64


In [22]:
tw_mitaka['location'].nunique()

791

In [23]:
loc_freq = {}
for i in tw_mitaka['location']:
    i = str(i).lower()
    if i == 'nan':
        pass
    else:
        location_split = i.split()
        for word in location_split:
            word = re.sub(r'[^\w\s]', '', word)
            if word.isnumeric() == True:
                continue
            elif word not in loc_freq:
                loc_freq[word] = 0
            loc_freq[word]+=1

In [24]:
loc_df = pd.DataFrame(loc_freq.values(), index = loc_freq.keys(), columns=['Frequency'])
loc_df = loc_df.drop('')

In [25]:
loc_df.sort_values(by = 'Frequency', ascending=False)[0:10]

Unnamed: 0,Frequency
japan,127
tokyo,123
sheher,107
ic,49
hehim,34
the,34
in,33
count,28
千葉三鷹,28
mazamuno,27


In [26]:
for i in loc_df.index[0:10]:
    print(i)

tokyo
japan
千葉三鷹
paddys
pub
heskull
sheher
bi
marins
house


In [27]:
gc = geonamescache.GeonamesCache()
countries = gc.get_countries()
cities = gc.get_cities()
states = gc.get_us_states()

#Countries
country_list = []
for country in countries.values():
    for location in loc_df.index:
        if location.lower() == country['iso'].lower():
            country_list.append(location.lower())
        elif location.lower() == country['name'].lower():
            country_list.append(location.lower())
        elif location.lower() == country['capital'].lower():
            country_list.append(location.lower())
        else:
            continue
#US States
'''
for k, v in states.items():
    print(v['code'], v['name'])
'''

"\nfor k, v in states.items():\n    print(v['code'], v['name'])\n"

In [28]:
#Cities
city_list = []
for city in cities.values():
    print(city['alternatenames'])

["Ehskal'des-Ehndzhordani", 'Escaldes', 'Escaldes-Engordany', 'Les Escaldes', 'esukarudesu=engorudani jiao qu', 'lai sai si ka er de-en ge er da', 'Эскальдес-Энджордани', 'エスカルデス＝エンゴルダニ教区', '萊塞斯卡爾德-恩戈爾達', '萊塞斯卡爾德－恩戈爾達']
['ALV', 'Ando-la-Vyey', 'Andora', 'Andora la Vela', 'Andora la Velja', "Andora lja Vehl'ja", 'Andoro Malnova', 'Andorra', 'Andorra Tuan', 'Andorra a Vella', 'Andorra la Biella', 'Andorra la Vella', 'Andorra la Vielha', 'Andorra-a-Velha', "Andorra-la-Vel'ja", 'Andorra-la-Vielye', 'Andorre-la-Vieille', 'Andò-la-Vyèy', 'Andòrra la Vièlha', 'an dao er cheng', 'andolalabeya', 'andwra la fyla', 'Ανδόρρα', 'Андора ла Веля', 'Андора ла Веља', 'Андора ля Вэлья', 'Андорра-ла-Велья', 'אנדורה לה וולה', 'أندورا لا فيلا', 'አንዶራ ላ ቬላ', 'アンドラ・ラ・ヴェリャ', '安道爾城', '안도라라베야']
['Oumm al Qaiwain', 'Oumm al Qaïwaïn', 'Um al Kawain', 'Um al Quweim', 'Umm Al Quwain City', 'Umm al Qaiwain', 'Umm al Qawain', 'Umm al Qaywayn', 'Umm al-Quwain', "Umm-ehl'-Kajvajn", 'Yumul al Quwain', 'am alqywyn', 'mdy

In [29]:
'''
Next Steps:
Create visualizations of findings.
Time, Source, Score, username maybe (who's tweeting the most?), location (need to clean)

Visualizations:

Word Frequency.
VADER Sentimental Analysis - # of Positives and # negatives
	Provide an example tweet for each.

Username frequency
Location frequency 
Source frequency
Time of tweets (using SQL is best) - Most active time?
'''

"\nNext Steps:\nCreate visualizations of findings.\nTime, Source, Score, username maybe (who's tweeting the most?), location (need to clean)\n\nVisualizations:\n\nWord Frequency.\nVADER Sentimental Analysis - # of Positives and # negatives\n\tProvide an example tweet for each.\n\nUsername frequency\nLocation frequency \nSource frequency\nTime of tweets (using SQL is best) - Most active time?\n"