In [1]:
import pandas as pd
import sqlite3
import emoji
from collections import Counter
from pprint import pprint

from matplotlib.pyplot import yscale, xscale, title, plot
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import jieba
import re
from collections import Counter
import jieba.analyse

In [2]:
def read_sql():
    """Run the whole loading pipeline and return the cleaned DataFrame.

    The three steps are applied in order: load the raw 2016 posts from the
    SQLite files, keep the posts inside the Taipei bounding box, then trim
    the frame down to the columns used by the rest of the notebook.
    """
    posts_2016 = __read_sql()
    posts_taipei = __select_taipei_area(posts_2016)
    return __clean_raw_dataframe(posts_taipei)

# read raw data from sql
def __read_sql():
    """Load every raw post table and return the rows dated in 2016.

    Reads six tables spread over three SQLite files under ``../data``,
    stacks them, drops exact duplicate rows, adds a ``datetime`` column
    parsed from the ``date`` column (interpreted as seconds since the epoch)
    and keeps only the rows whose ``datetime`` year is 2016.

    Returns:
        pandas.DataFrame: the 2016 rows, with the extra ``datetime`` column.
        The index is whatever ``pd.concat`` produced (not renumbered).
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from contextlib import closing

    # database file -> tables to read from it, in the original load order
    sources = {
        '../data/InstagramPost.sqlite': [
            'InstagramPost',
            'InstagramPost_table2',
            'InstagramPost_table3',
            'InstagramPost_table4',
        ],
        '../data/InstagramPost2.sqlite': ['InstagramPost_table2'],
        '../data/InstagramPost3.sqlite': ['InstagramPost'],
    }

    frames = []
    for db_path, tables in sources.items():
        # sqlite3's own context manager only commits/rolls back; closing()
        # is what actually releases the file handle when the block exits.
        with closing(sqlite3.connect(db_path)) as con:
            for table in tables:
                frames.append(pd.read_sql_query('select * from ' + table, con))

    # concat SQL tables
    posts = pd.concat(frames).drop_duplicates()
    del frames  # release the per-table frames before building more columns
    posts['datetime'] = pd.to_datetime(posts.date, unit='s')
    return posts[posts.datetime.dt.year == 2016]

# select taipei area
def __select_taipei_area(df, lat_range=(24.975456, 25.100281),
                         lng_range=(121.461888, 121.611286)):
    """Keep only the rows located inside a latitude/longitude bounding box.

    Args:
        df: DataFrame with ``lat`` and ``lng`` columns holding values that
            can be cast to float.
        lat_range: ``(min, max)`` latitude, both ends inclusive. The default
            is the Taipei City box used by this notebook.
        lng_range: ``(min, max)`` longitude, both ends inclusive.

    Returns:
        pandas.DataFrame: a new frame with ``lat``/``lng`` as floats and only
        the rows inside the box. The input frame is not modified.

    A smaller box around Keelung Road that was used during exploration:
    ``lat_range=(25.032840, 25.051496)``, ``lng_range=(121.557441, 121.580229)``.
    """
    lat_min, lat_max = lat_range
    lng_min, lng_max = lng_range

    # assign() returns a new frame. Writing df['lat'] = ... would modify the
    # caller's object, which here is a filtered slice of a larger frame and
    # therefore triggers pandas' SettingWithCopyWarning.
    located = df.assign(lat=df.lat.astype(float), lng=df.lng.astype(float))

    inside_lat = located.lat.between(lat_min, lat_max)
    inside_lng = located.lng.between(lng_min, lng_max)
    return located[inside_lat & inside_lng]

# clean dataframe
def __clean_raw_dataframe(df):
    """Keep the columns the notebook uses and renumber the rows from 0.

    Args:
        df: DataFrame containing at least the columns listed in ``keep``.

    Returns:
        pandas.DataFrame: the selected columns, in that order, with a fresh
        0..n-1 index.
    """
    keep = ['location_id', 'location_name', 'lat', 'lng', 'caption', 'comments',
            'date', 'media_id', 'is_video', 'likes', 'owner_id', 'thumbnail_src',
            'display_src', 'datetime']
    # drop=True throws the old index away directly. The earlier
    # reset_index().drop(columns='index') only worked for an unnamed index:
    # a named index becomes a column with that name, so 'index' is missing
    # and drop() raises KeyError.
    return df[keep].reset_index(drop=True)

In [3]:
# Register the custom stop words used by jieba's keyword extraction.
# jieba.add_word() takes ONE dictionary word, so passing it the whole file
# content registered a single bogus entry and no stop word at all.
# jieba.analyse.set_stop_words() reads the file (UTF-8, one word per line)
# and makes extract_tags() skip those words.
STOPWORD_PATH = 'add_stopword.txt'

# `stopword` is kept so any cell that reads the raw text still works.
with open(STOPWORD_PATH, encoding='utf-8') as f:
    stopword = f.read()

jieba.analyse.set_stop_words(STOPWORD_PATH)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ronn\AppData\Local\Temp\jieba.cache
Loading model cost 0.910 seconds.
Prefix dict has been built succesfully.


# 1. Read data

In [4]:
# Load the 2016 posts inside the Taipei bounding box, trimmed to the columns used below.
ig2016 = read_sql()

In [5]:
# Preview the first rows of the loaded posts.
ig2016.head()

Unnamed: 0,location_id,location_name,lat,lng,caption,comments,date,media_id,is_video,likes,owner_id,thumbnail_src,display_src,datetime
0,539785069465598,駱馬休息站,25.060489,121.558019,-\n「駱馬」\n哺乳綱 偶蹄目 駱駝科 \n駱馬又名無峰駱駝，是群居的動物，非常耐旱，牠的...,15,1477227676,1367407320455448673,0,77,472063179,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,2016-10-23 13:01:16
1,1030274404,珍妮姐姐,25.03307,121.574303,4y9m31d\n🎃🎃🎃萬聖節不搗蛋要幹嘛.....來當小小波麗士啦！🚔🚔🚔一個好老成的交警...,0,1477722698,1371559858339338657,0,10,471686179,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,2016-10-29 06:31:38
2,512246272215639,明城鎖印行,25.054319,121.579559,原來嘟嘟嘴,0,1463601509,1253102745434590928,1,9,3103213202,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,2016-05-18 19:58:29
3,176805912370722,天使部屋美甲美睫,25.04213,121.54568,ชอบกิน street food กันมากกว่า เพราะรสชาติ auth...,6,1463216532,1249873319996801497,0,3115,22177415,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,2016-05-14 09:02:12
4,1030059774,上順五金,25.045666,121.572621,殺氣⋯,0,1457411803,1201179727676509838,0,5,1656340962,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,https://instagram.ftpe7-4.fna.fbcdn.net/t51.28...,2016-03-08 04:36:43


# 2. Test TF-IDF

In [6]:
# Small sample (first 10 posts) for quick experiments.
test = ig2016.head(10)

In [7]:
# Make jieba's TF-IDF extractor use the IDF table in this file instead of its
# built-in one. The path is relative to the notebook's working directory.
jieba.analyse.set_idf_path('./idf.txt.big')

In [8]:
def df_tag(caption, topK=10):
    """Extract the highest-weighted TF-IDF keywords from one text.

    Args:
        caption: the text to analyse (a single string).
        topK: maximum number of keywords to return. Defaults to 10, the
            value this notebook has always used.

    Returns:
        list: the keywords returned by ``jieba.analyse.extract_tags``; it can
        hold fewer than ``topK`` items.
    """
    return jieba.analyse.extract_tags(caption, topK=topK)

In [9]:
# Group the posts by the name of the location they were tagged with.
loc_gb = ig2016.groupby('location_name')

In [10]:
# Merge all captions posted at one location into a single document.
# Captions are joined with a newline instead of being summed back to back,
# so the last word of one post cannot fuse with the first word of the next
# and distort the keyword counts. Missing captions are skipped because they
# cannot be concatenated with text.
cap = loc_gb['caption'].apply(lambda x: '\n'.join(x.dropna().astype(str)))

In [11]:
# Preview the merged caption text of the first few locations.
cap.head()

location_name
(財)日本交流協会 台北事務所                  台湾で運転するために翻訳文を作ってもらった\n明日は遠くに行こう\n#台湾 #台湾で運転 #...
-東洋寶時捷-                          颱風天就是要開速霸陸啊 不然要幹嘛😏SUBARU forge engine ej25 .Po...
1/10 Cake                        覺得歲末就是胖胖胖\n跟著 @ew__33 就變這樣\n這個月無節制\n下個月不省不行了\n...
10 Square Cafe                   好朋友一定要膩在一起見面到最後一天 2016 last day in 10 square c...
1001 Nights Taipei (一千零一夜水煙館)    智障頭飾hold住整個舞池🤡🤡🤡\n-\n#happynewyear #2017 #danc...
Name: caption, dtype: object

In [14]:
# One row per location: the merged caption text plus its extracted keywords.
igCap = cap.reset_index().assign(
    tags=lambda frame: frame['caption'].apply(df_tag)
)

In [15]:
# Preview the keywords extracted for the first few locations.
igCap.head()

Unnamed: 0,location_name,caption,tags
0,(財)日本交流協会 台北事務所,台湾で運転するために翻訳文を作ってもらった\n明日は遠くに行こう\n#台湾 #台湾で運転 #...,"[台湾, 計画, 明日]"
1,-東洋寶時捷-,颱風天就是要開速霸陸啊 不然要幹嘛😏SUBARU forge engine ej25 .Po...,"[風天, 就是, 開速, 霸陸, 不然, SUBARU, forge, engine, ej..."
2,1/10 Cake,覺得歲末就是胖胖胖\n跟著 @ew__33 就變這樣\n這個月無節制\n下個月不省不行了\n...,"[蛋糕, 水果, 千層, 檸檬塔, 10, 台北, 巨蛋, 甜點, 美食, cake]"
3,10 Square Cafe,好朋友一定要膩在一起見面到最後一天 2016 last day in 10 square c...,"[10squarecafe, cafe, 咖啡, sweet, love, cake, de..."
4,1001 Nights Taipei (一千零一夜水煙館),智障頭飾hold住整個舞池🤡🤡🤡\n-\n#happynewyear #2017 #danc...,"[taipei, hookah, party, taiwan, life, drinks, ..."


# Save the location -> keywords table (the row index is written as the first column).
igCap.to_csv('caption_tfidf.csv', columns=['location_name', 'tags'], encoding='utf8')