In [3]:
# Standard Headers
# You are welcome to add additional headers here if you wish
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

# Social Network Dataset

Below are descriptions for each provided dataset. 

1. **person_knows_person**
This table represents the friendship between users. A person can have many friends. Persons have unique integer id number. 

2. **person_likes_post_file** 
This tables represents user likes. A person and a post is represented by IDs. 

3. **post_hasCreator_person**
A person creates many posts. Each post has a unique creator person id. 

4. **comment_replyOf_post**
This table represents comments on posts. Both post and comments have unique ids. 

5. **comment_hasCreator_person** 
A user can comment on posts. Each comment has a unique creator person id. 


In [4]:
url = ""
person_knows_person_file = url + "person_knows_person.csv"
person_likes_post_file = url + "person_likes_post.csv"
post_hasCreator_person_file = url + "post_hasCreator_person.csv"
comment_replyOf_post_file = url + "comment_replyOf_post.csv"
comment_hasCreator_person_file = url + "comment_hasCreator_person.csv"

person_knows_person = pd.read_csv(person_knows_person_file,  sep='|')
person_likes_post = pd.read_csv(person_likes_post_file,  sep='|')
post_hasCreator_person = pd.read_csv(post_hasCreator_person_file,  sep='|')
comment_replyOf_post = pd.read_csv(comment_replyOf_post_file, sep='|')
comment_hasCreator_person = pd.read_csv(comment_hasCreator_person_file,  sep='|')

print(person_knows_person.head(5))
print("-------------------------")

print(person_likes_post.head(5))
print("-------------------------")

print(post_hasCreator_person.head(5))
print("-------------------------")

print(comment_replyOf_post.head(5))
print("-------------------------")

print(comment_hasCreator_person.head(5))
print("-------------------------")

FileNotFoundError: [Errno 2] No such file or directory: 'person_knows_person.csv'

Cleaning Task 1 ~ Top-10 users who have the highest number of friends

In [5]:
# top 10 users with max friends using nlargest() 
friend_counts = person_knows_person["Person.id"].value_counts()
top_10 = friend_counts.nlargest(10)

# print top 10 users with max friends
print("Below is a list of the top-10 users with the highest number of friends:") 
print(top_10)

NameError: name 'person_knows_person' is not defined

Cleaning Task 2 ~ Most liked post 


In [30]:
# number of likes and max likes 
likes = person_likes_post["Post.id"].value_counts()
max_likes = likes.max()

# new df column counts 
counts = likes.reset_index().rename(columns={"index": "Post.id",0: "count"})

# max is equal to 9 and therefore set to 9
max_liked_posts = (counts[counts["count"] == 9])

# merge max liked posts to post.id of user 
most_liked_post = pd.merge( max_liked_posts, post_hasCreator_person, how = 'left', on = 'Post.id')
print(most_liked_post["Person.id"])

0        451
1        238
2        429
3        429
4         78
        ... 
13153     47
13154    359
13155    359
13156    463
13157    586
Name: Person.id, Length: 13158, dtype: int64


Cleaning Task 3 ~ The most influential post is the most discussed and most liked post. 


In [27]:
# merging num of occurences of comments and likes
num_comments = comment_replyOf_post["Post.id"].value_counts()
num_likes = person_likes_post["Post.id"].value_counts()
likes_and_comments = pd.merge(num_comments, num_likes, how = 'left', on='Post.id')

# creating comment and likes dataframes then merging 
likes_and_comments = likes_and_comments.rename(columns = {"count_x":"comments"})
likes_and_comments = likes_and_comments.rename(columns = {"count_y":"likes"})
likes_and_comments['total'] = likes_and_comments['comments'] + likes_and_comments['likes']
likes_and_comments = likes_and_comments.sort_values("total", ascending=False)
max_post = likes_and_comments['total'].idxmax()

# identify max person_id
person_id = (post_hasCreator_person[post_hasCreator_person["Post.id"] == max_post])["Person.id"]
print(f"User with the Person id {person_id.max()} wrote the most influential post.")

User with the Person id 649 wrote the most influential post.


Cleaning Task 4 ~ 2 histograms for the distributions of the number of likes and comments that users have created (including descriptions of data distributions)

In [2]:
# count num occurences of likes and comments per user
user_likes = person_likes_post["Person.id"].value_counts()
user_comments = comment_hasCreator_person["Person.id"].value_counts()

# likes per user histogram
plt.figure(figsize=(8,4))
user_likes.plot(kind='hist', bins=15, color = 'teal')
plt.title("total likes per user")
plt.xlabel('likes')
plt.ylabel('frequency')
plt.show()

# comments per user histogram
plt.figure(figsize=(8,4))
user_comments.plot(kind='hist', bins=15, color = 'magenta')
plt.title('total comments per user')
plt.xlabel('comments')
plt.ylabel('frequency')
plt.show()

# histogram distribution description 
print("Both histograms reveal that the distribution of the data is heavily right-skewed.")

NameError: name 'person_likes_post' is not defined

# Question - 5. What is the Pearson correlation coefficient between the number of comments and the number of likes that users do on the social network? (4 points)

Print out one number. 



**Tip:** You can calculate correlation coefficient using the following formula: 
Assume that x and y are two arrays of data, in this case number of likes and comments of each user. n is the number of users. 

**Bonus Tip:** Consider that some users might have liked posts, but not liked any comments, or vice versa.

You can use whichever technique you would like for the question, as long as it has been discussed in lecture.

 


\begin{align*}
r= {{ n(\sum x y ) - (\sum x ) (\sum y)    } \over { \sqrt{ [ n \sum x^2  -
(\sum x)^2 ] [ n\sum y^2 - (\sum y )^2 ] } } }
\end{align*}











In [26]:
# merge likes and comments by user
likes_and_comments = pd.merge(user_likes, user_comments, how = 'outer', on='Person.id')
likes_and_comments = likes_and_comments.rename(columns = {"count_y":"comments"})
likes_and_comments = likes_and_comments.rename(columns = {"count_x":"likes"})
num_users = len(likes_and_comments)

# replace 'NaN' vales with 0 both columns will only have ints 
likes_and_comments['likes'] = likes_and_comments['likes'].replace('NaN',0)
likes_and_comments['comments'] = likes_and_comments['comments'].replace('NaN',0)

# computing correlation coefficient 
corr_coeff = likes_and_comments['likes'].corr(likes_and_comments['comments'])
print(f"{round(corr_coeff, 2)} is the Pearson correlation coefficient between number of comments and number of likes.")


0.92 is the Pearson correlation coefficient between number of comments and number of likes.
