# Explorary Analysis

### You can find these stuffs in this part:
1) __preliminary graph statistical analysis__
#1 total number of edges 
#2 nodes
#3 strongest connected component 
#4 average degree of the node 
#5 average path length
#6 diameter 
#7 clustering coefficient

2) __detecting the strength of the video categories by finding out which categories of videos had the__
#8 highest number of videos
#9 highest number of views 
#10 highest comments

3) __detecting Influential Video Uploaders for finding out which uploader with had the__ 
#11 highest number of videos
#12 highest number of views 
#13 highest subscribers

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1-Load Data

In [2]:
import io
import os
import csv

def load_data(path):
    with open(path,'r') as f:
        reader = csv.reader(f)
        txt = list(reader)
    txt = [element[0].split('\t') for element in txt]
    return txt

data_path = "/home/silvia/文档/DSBA/NGSA/Final Project/data"

In [3]:
# txt0, txt1, txt2, txt3 is the 1st depth, 2nd depth, 3th depth and 4th depth BFS respectively.
# here we used the data collected at 2007-03-02, about 40k distinct videos in all 3 depth datasets.
# There are 10324 raws of data.

txt1 = load_data(os.path.join(data_path, '0302/0.txt'))
txt2 = load_data(os.path.join(data_path, '0302/1.txt'))
txt3 = load_data(os.path.join(data_path, '0302/2.txt'))
data =  txt1 + txt2 + txt3
total = data[0:5000] # using just 5k videos....limited by the computer computing capacity

In [4]:
# names for each column: 
# [video ID, uploader, age, category, length, views, rate, ratings, comments]
node_info = [element[0:9] for element in total]

# the rest columns are related video id of source video
related_id = [element[10:] for element in total]

# returns all the source video ids in the dataset, they are nodes in the graph
ids = [element[0] for element in total]

## 2-Graph analysis

### Build graph

In [5]:
from itertools import combinations
nodes_pair = list(combinations(ids,2))
# np.savetxt('nodes_pair.txt', nodes_pair)

# then we gonna see whether the nodes pair connect with each other or not.
# for a nodes pair (a,b), if nodes b appears in the related_id of nodes a, then a and b are connected.

id1 = [ x for x,y in nodes_pair] # the source video id
id2 = [ y for x,y in nodes_pair] # the target video id

# create a id:related_id dictionary 
related_id_dict = {k:v for k,v in zip(ids,related_id)}

# to decide whether two nodes in a node pair connect or not
connect = [1 if y not in related_id_dict[x] else 0 for x,y in nodes_pair]
pair_connect_dict = {k:v for k,v in zip(nodes_pair,connect)} # build a nodes_pair:connect dictionary

edges = [nodes_pair for nodes_pair in nodes_pair if pair_connect_dict[nodes_pair] == 1]

In [None]:
# Now we can create the graph, here we use igraph

import igraph

# create raw empty undirected graph
g = igraph.Graph(directed=False)

# add vertices
g.add_vertices(ids)

# add edges
g.add_edges(edges)

In [6]:
# because it would be easier to use networkx to analyse some features, so we also build a graph using networkx

import networkx as nx

# create an empty graph
G = nx.Graph()

#add nodes
G.add_nodes_from(ids)

#add edges
G.add_edges_from(edges)

### Explore the properties

In [None]:
# total number of edges(using igraph)
print(len(edges))

# __output__: 12480152

In [None]:
# to see the degree distribution (using igraph)
print(g.degree_distribution(bin_width=2))

# __output__:
# N = 5000, mean +- sd: 4992.0608 +- 6.0542
# Each * represents 19 items
# [4970, 4972):  (2)
# [4972, 4974):  (4)
# [4974, 4976): * (24)
# [4976, 4978): * (19)
# [4978, 4980): ****** (116)
# [4980, 4982): ************* (248)
# [4982, 4984): ************* (253)
# [4984, 4986): ************** (282)
# [4986, 4988): ************* (255)
# [4988, 4990): ************** (283)
# [4990, 4992): ***************** (336)
# [4992, 4994): *********************** (442)
# [4994, 4996): ********************************* (633)
# [4996, 4998): ********************************************************** (1117)
# [4998, 5000): *************************************************** (986)

In [None]:
strong_components = g.components(mode='strong')
print(strong_components)
# because the videos we chose are very related with other videos in the dataset, so there is just one cluster

In [None]:
# to see the information of communities 
C = g.community_infomap()

# to see the number of communities
print(len(C))

# to see the size of each communities
for n in range(0,len(C)):
    print('Community nº', n, 'size:', len(C[n]))
    
# __output__:
# 1
# Community nº 0 size: 5000

In [None]:
# diameter: the longest path of the shortest paths between any two nodes.
print(g.diameter(directed=True))

# __output__: 2

In [None]:
# average shortest path

print(nx.average_shortest_path_length(G))

In [None]:
# cluster coefficient
# transitivity_undirected is 3 x (# of triangles) / (# of connected triplets)

print(g.transitivity_undirected())

# __output__ = 0.9986124578666629

In [None]:
# plot the graph

#z=g.layout('fr')
igraph.plot(g)

### __Analysis__:

__Community__: Because the videos we chose are very related with other videos in the dataset, so there is just one cluster

__Diameter__: Diameter is small in this graph, which means all the nodes in this graph connect very tightly.

__Cluster coefficient__: A clustering coefficient is a measure of the degree to which nodes in a graph tend to cluster together. The result is very high, which is line with real world for in most real-world networks, and in particular social networks, nodes tend to create tightly knit groups characterised by a relatively high density of ties.

## 3-Video characteristics analysis

### Category analysis

In [None]:
categories = sum([element[3:4] for element in total],[])
cat_list = list(set(categories)) # return a distinct category list

In [None]:
# to see which category have the highest number of videos

from collections import Counter
print(Counter(categories)) # to see the number of videos in each category

# plot a histgram to shwo the result
plt.figure(figsize=(5,5))
labels, values = zip(*Counter(categories).items())

indexes = np.arange(len(labels))
width = 0.5

plt.bar(indexes, values, width, color='Thistle')
plt.xticks(indexes + width * 0.5, labels, rotation=90)
plt.show()

In [None]:
# firstly, we define a function to calculate the number of views by certain classifier. Actually, I want badly to use Tableau to do so.

def Sum(clf,item,Type):
    Sum = sum([int(y) for x,y in zip(clf,Type) if x == item])
    return Sum

In [None]:
# to see which category have the highest unmber of views

views = sum([element[5:6] for element in total],[])
v_sum = [Sum(categories,cat, views) for cat in cat_list]
print('_Number of views by category_:','\n',{k:v for k,v in zip(cat_list,v_sum)})

In [None]:
# to see which category have the highest number of comments

comments = sum([element[7:8] for element in total],[])
c_sum = [Sum(categories,cat,comments) for cat in cat_list]
print('_Number of comments by category_:','\n',{k:v for k,v in zip(cat_list,c_sum)})

### Uploader analysis

In [None]:
# to see which uploader with highest number of videos

uploaders = sum([element[1:2] for element in total],[])
uploaders_list = list(set(uploaders))
print('The uploaders who has the highest number of videos is:',max(Counter(uploaders_list)))

In [None]:
# to see which uploader with highest number of views

uploader_v_sum = [Sum(uploaders,upl,views) for upl in uploaders_list]
print('The uploaders who has the highest number of views is:',
      {k:v for k,v in zip(uploader_v_sum,uploaders_list)}[max(uploader_v_sum)])

In [None]:
# to see which uploader with the highest number of comments

uploader_c_sum = [Sum(uploaders,upl,comments) for upl in uploaders_list]
print('The uploaders who has the highest number of comments is:',
      {k:v for k,v in zip(uploader_c_sum,uploaders_list)}[max(uploader_c_sum)])