## 1. Importing libraries

In [112]:
import matplotlib.pyplot as plt
import plotly.express as px
import json
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import dateutil.parser
from datetime import datetime


## 2. Reading data  
#### Post comments data from my Instagram

In [130]:
#loading the exported Instagram post-comments file (JSON text, read as UTF-8)
with open("post_comments.json", encoding='utf-8') as myFile:
  commentsData = json.loads(myFile.read())
#a sample instance can be inspected with the line below
#print(commentsData["comments_media_comments"][0]['string_map_data']['Comment']['value'])

## 3. Extracting comments from data
#### We extract only the textual information from the data and ignore the timestamp parameter for now

In [131]:
#every post comment in the export, one entry per comment
commentEntries = commentsData["comments_media_comments"]
#length of the json data : number of total instagram post comments
length = len(commentEntries)
print("The number of comments from 2016 to 2022 on my Instagram : ", length)
# converting complex json data to a list only with post comments (textual info).
# NOTE: the ASCII round-trip with errors='ignore' removes emojis, but it also
# removes every other non-ASCII character (accented letters, non-Latin scripts).
listComments = [
  entry['string_map_data']['Comment']['value'].encode('ascii', errors='ignore').decode()
  for entry in commentEntries
]


The number of comments from 2016 to 2022 on my Instagram :  1975


In [None]:
#generating wordcloud for Instagram Post comments
def generateWordcloud(listComments):
  """Draw a word cloud built from all comments joined into one string.

  Args:
    listComments: iterable of comment strings; they are joined with single
      spaces before being handed to WordCloud.
  """
  cloud = WordCloud(width = 1000, height = 500).generate(" ".join(listComments))
  fig, ax = plt.subplots(figsize=(15,8))
  ax.imshow(cloud)
  ax.axis("off")
  #fig.savefig("wordcloud.png", bbox_inches='tight')
  plt.show()
  # release the figure once it has been rendered
  plt.close(fig)

generateWordcloud(listComments)

#wordcloud output is hidden for privacy reasons

## 4. Extracting timestamp from the data

In [132]:
#extracting the timestamp from the comments posted from the json data
def extractTimestamp():
  """Collect the timestamp of every comment into a one-column DataFrame.

  Reads the module-level ``commentsData`` and parses each
  ``string_map_data -> Time -> value`` string with ``dateutil.parser.parse``.

  Returns:
    pandas.DataFrame with a single column labelled ``0`` holding the parsed
    timestamps, one row per comment in the order they appear in the export.
  """
  rawTimestamps = [
    entry['string_map_data']['Time']['value']
    for entry in commentsData["comments_media_comments"]
  ]
  parsedTimestamps = [dateutil.parser.parse(raw) for raw in rawTimestamps]
  # columns=[0] keeps the column label the rest of the notebook indexes by,
  # and guarantees the column exists even when there are no comments.
  return pd.DataFrame(parsedTimestamps, columns=[0])

# The result must be bound at notebook level: previously it only lived in a
# local variable inside the function, so the cells below raised NameError on a
# fresh kernel (Restart & Run All).
dfTimestamp = extractTimestamp()
# formatted_time is also read as a module-level name by later cells
formatted_time = dfTimestamp[0].tolist()

In [133]:
#previewing the parsed timestamps stored in column 0
dfTimestamp[0]

0      2017-04-19 03:35:00
1      2017-04-19 02:53:00
2      2017-04-17 12:22:00
3      2017-04-14 08:40:00
4      2017-04-09 02:07:00
               ...        
1970   2021-12-20 11:49:00
1971   2021-12-12 20:53:00
1972   2021-12-12 08:51:00
1973   2021-11-26 07:15:00
1974   2021-11-17 23:49:00
Name: 0, Length: 1975, dtype: datetime64[ns]

In [143]:
#post parsing, we are extracting year, month, day and time from the timestamp
def parseTimestamp(dfTimestamp):
  """Split parsed timestamps into year / month / day / time string columns.

  Args:
    dfTimestamp: DataFrame whose column ``0`` holds datetime values.

  Returns:
    DataFrame with a fresh 0..n-1 index and four string columns:
    ``year`` (``%Y``), ``month`` (first three letters of the English month
    name, e.g. ``Apr``), ``day`` (``%d``) and ``time`` (``%H:%M:%S``).
    The same frame is also published as the module-level
    ``dfTimestampParsed`` for cells that read that name directly.
  """
  global dfTimestampParsed
  # Work from the frame that was passed in rather than from notebook globals,
  # and drop its index so the result is always positionally indexed.
  stamps = pd.to_datetime(dfTimestamp[0]).reset_index(drop=True)
  dfTimestampParsed = pd.DataFrame({
    'year': stamps.dt.strftime("%Y"),
    #month names instead of numbers. Eg - 04 becomes Apr
    'month': stamps.dt.month_name().str.slice(stop=3),
    'day': stamps.dt.strftime("%d"),
    'time': stamps.dt.strftime("%H:%M:%S"),
  })
  return dfTimestampParsed

#building dfTimestampParsed, which the analysis cells below take as input
parseTimestamp(dfTimestamp)  


In [144]:
#displaying the year / month / day / time table built from the timestamps
dfTimestampParsed

Unnamed: 0,year,month,day,time
0,2017,Apr,19,03:35:00
1,2017,Apr,19,02:53:00
2,2017,Apr,17,12:22:00
3,2017,Apr,14,08:40:00
4,2017,Apr,09,02:07:00
...,...,...,...,...
1970,2021,Dec,20,11:49:00
1971,2021,Dec,12,20:53:00
1972,2021,Dec,12,08:51:00
1973,2021,Nov,26,07:15:00


## Analysis

- Most Post Comments By Year 

In [156]:
def commentsByYear(dfTimestampParsed):
    """Show a bar chart of how many comments were posted in each year.

    Args:
        dfTimestampParsed: DataFrame with a 'year' column; bars are the
            value counts of that column, coloured by their height.
    """
    counts_per_year = dfTimestampParsed['year'].value_counts()
    layout_options = dict(
        title = "Most Post Comments By Year",
        autosize=False,
        width=700,
        height=500,
        xaxis_title="Year",
        yaxis_title="Number of Comments",
    )
    chart = px.bar(counts_per_year, color=counts_per_year, color_continuous_scale='orrd')
    chart.update_layout(**layout_options)
    chart.update_traces(textposition='outside')
    chart.show()

commentsByYear(dfTimestampParsed)


- Most Post Comments By Month

In [177]:
def commentsByMonth(dfTimestampParsed):
    """Show a bar chart of comment counts per month, largest count first.

    Args:
        dfTimestampParsed: DataFrame with a 'month' column; bars are the
            value counts of that column, coloured by their height.
    """
    month_counts = dfTimestampParsed['month'].value_counts()
    month_counts = month_counts.sort_values(ascending = False)
    layout_options = dict(
        title = "Most Post Comments By Month",
        autosize=False,
        width=700,
        height=500,
        xaxis_title="Month",
        yaxis_title="Number of Comments",
    )
    chart = px.bar(month_counts, color= month_counts, color_continuous_scale='purd')
    chart.update_layout(**layout_options)
    chart.show()
commentsByMonth(dfTimestampParsed)

- Most Post Comments By Date

In [182]:
def commentsByDate(dfTimestampParsed):
    """Show a horizontal bar chart of the seven busiest days of the month.

    Args:
        dfTimestampParsed: DataFrame with a 'day' column; the seven most
            frequent values are kept and drawn in ascending order of count.
    """
    top_days = dfTimestampParsed['day'].value_counts().head(7)
    top_days = top_days.sort_values(ascending = True)
    layout_options = dict(
        title = "Most Post Comments By Date",
        autosize=False,
        width=700,
        height=500,
        yaxis_title="Date",
        xaxis_title="Number of Comments",
    )
    chart = px.bar(top_days, orientation='h', color= top_days, color_continuous_scale='rdpu')
    chart.update_layout(**layout_options)
    chart.show()

commentsByDate(dfTimestampParsed)

- At what time do I comment the most?

In [183]:
def commentsbyTime(dfTimestampParsed): 
  """Show a horizontal bar chart of comment counts per hour of the day.

  Args:
    dfTimestampParsed: DataFrame with a 'time' column of "HH:MM:SS" strings.

  Side effects:
    Adds (or overwrites) a 'timeByHours' column on the DataFrame that was
    passed in, holding the hour part of each 'time' value as a string.
  """
  # The hour is the text before the first ':'. Using the .str accessor keeps
  # the result aligned with the frame's own index and sized by the frame
  # itself, instead of looping over the notebook-level `length`.
  dfTimestampParsed['timeByHours'] = dfTimestampParsed['time'].str.split(":").str[0]

  by_hours = dfTimestampParsed['timeByHours'].value_counts().sort_values(ascending = True)
  fig1 = px.bar(by_hours, orientation='h', color= by_hours, color_continuous_scale='dense')
  fig1.update_layout(
    title = "Most Post Comments By Hours",
    autosize=False,
    width=700,
    height=500,
    xaxis_title="Number of Comments", 
    yaxis_title="Time in Hours"
    )
  fig1.show()

commentsbyTime(dfTimestampParsed)