# Importing Necessary Libraries

In [1]:
import pandas as pd
import json

# Reading Data

In [2]:
def read_large_json(file_path, chunksize=10000):
    """Reads a large JSON file in chunks and yields DataFrames.

    Args:
        file_path (str): Path to the JSON file.
        chunksize (int, optional): Number of lines to read at a time. Defaults to 1000.

    Yields:
        pandas.DataFrame: A chunk of the data as a DataFrame.
    """

    with open(file_path, "r", encoding="utf-8") as file:
        reader = pd.read_json(file, lines=True, chunksize=chunksize)
        for chunk in reader:
            yield chunk

# Example usage
data = pd.DataFrame()
for chunk in read_large_json("D:/GhArchive/Demo_Processed_Files&Script/2023-01-01-0.json"):    # or get the demo file here: https://drive.google.com/file/d/11lUM4tU1sRsgMwvE-AFwkWKnW9vpS_r3/view
    data = pd.concat([data, chunk], ignore_index=True)

data.head()  # View the first few rows


Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,26163418658,PushEvent,"{'id': 119809980, 'login': 'ehwu106', 'display...","{'id': 582174284, 'name': 'ehwu106/Gmail-Filte...","{'push_id': 12147229638, 'size': 2, 'distinct_...",True,2023-01-01 00:00:00+00:00,
1,26163418659,PushEvent,"{'id': 77187908, 'login': 'Cefqrn', 'display_l...","{'id': 583140987, 'name': 'Cefqrn/pyxpr', 'url...","{'push_id': 12147229637, 'size': 2, 'distinct_...",True,2023-01-01 00:00:00+00:00,
2,26163418660,IssuesEvent,"{'id': 121737278, 'login': 'LaymooDR', 'displa...","{'id': 383940088, 'name': 'ShadowMario/FNF-Psy...","{'action': 'opened', 'issue': {'url': 'https:/...",True,2023-01-01 00:00:00+00:00,
3,26163418664,WatchEvent,"{'id': 89544871, 'login': 'Aziz403', 'display_...","{'id': 2663796, 'name': 'lexik/LexikTranslatio...",{'action': 'started'},True,2023-01-01 00:00:00+00:00,"{'id': 568486, 'login': 'lexik', 'gravatar_id'..."
4,26163418665,PushEvent,"{'id': 8517910, 'login': 'LombiqBot', 'display...","{'id': 410004154, 'name': 'Lombiq/TheBootstrap...","{'push_id': 12147229641, 'size': 0, 'distinct_...",True,2023-01-01 00:00:00+00:00,"{'id': 8158177, 'login': 'Lombiq', 'gravatar_i..."


In [9]:
data

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,26163418658,PushEvent,"{'id': 119809980, 'login': 'ehwu106', 'display...","{'id': 582174284, 'name': 'ehwu106/Gmail-Filte...","{'push_id': 12147229638, 'size': 2, 'distinct_...",True,2023-01-01 00:00:00+00:00,
1,26163418659,PushEvent,"{'id': 77187908, 'login': 'Cefqrn', 'display_l...","{'id': 583140987, 'name': 'Cefqrn/pyxpr', 'url...","{'push_id': 12147229637, 'size': 2, 'distinct_...",True,2023-01-01 00:00:00+00:00,
2,26163418660,IssuesEvent,"{'id': 121737278, 'login': 'LaymooDR', 'displa...","{'id': 383940088, 'name': 'ShadowMario/FNF-Psy...","{'action': 'opened', 'issue': {'url': 'https:/...",True,2023-01-01 00:00:00+00:00,
3,26163418664,WatchEvent,"{'id': 89544871, 'login': 'Aziz403', 'display_...","{'id': 2663796, 'name': 'lexik/LexikTranslatio...",{'action': 'started'},True,2023-01-01 00:00:00+00:00,"{'id': 568486, 'login': 'lexik', 'gravatar_id'..."
4,26163418665,PushEvent,"{'id': 8517910, 'login': 'LombiqBot', 'display...","{'id': 410004154, 'name': 'Lombiq/TheBootstrap...","{'push_id': 12147229641, 'size': 0, 'distinct_...",True,2023-01-01 00:00:00+00:00,"{'id': 8158177, 'login': 'Lombiq', 'gravatar_i..."
...,...,...,...,...,...,...,...,...
101912,26163700494,PushEvent,"{'id': 93773753, 'login': 'BlaaSwe', 'display_...","{'id': 568383720, 'name': 'BlaaSwe/Base-Soarin...","{'push_id': 12147404982, 'size': 1, 'distinct_...",True,2023-01-01 00:59:59+00:00,
101913,26163700498,PushEvent,"{'id': 110655703, 'login': 'sfj297092319', 'di...","{'id': 530264309, 'name': 'usahexo/2452bb12a82...","{'push_id': 12147404988, 'size': 1, 'distinct_...",True,2023-01-01 00:59:59+00:00,"{'id': 110656312, 'login': 'usahexo', 'gravata..."
101914,26163700502,IssueCommentEvent,"{'id': 55374212, 'login': 'GuillaumePrata', 'd...","{'id': 3234987, 'name': 'tgstation/tgstation',...","{'action': 'created', 'issue': {'url': 'https:...",True,2023-01-01 00:59:59+00:00,"{'id': 1363778, 'login': 'tgstation', 'gravata..."
101915,26163700504,PushEvent,"{'id': 35409597, 'login': 'JetBlack011', 'disp...","{'id': 541393257, 'name': 'JetBlack011/convex-...","{'push_id': 12147404989, 'size': 1, 'distinct_...",True,2023-01-01 00:59:59+00:00,


# Checking Data Information

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101917 entries, 0 to 101916
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype              
---  ------      --------------   -----              
 0   id          101917 non-null  int64              
 1   type        101917 non-null  object             
 2   actor       101917 non-null  object             
 3   repo        101917 non-null  object             
 4   payload     101917 non-null  object             
 5   public      101917 non-null  bool               
 6   created_at  101917 non-null  datetime64[ns, UTC]
 7   org         25639 non-null   object             
dtypes: bool(1), datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 5.5+ MB


# Type of Events

In [4]:
data['type'].value_counts()

type
PushEvent                        61501
CreateEvent                      12414
PullRequestEvent                 10440
IssueCommentEvent                 5414
DeleteEvent                       4019
WatchEvent                        3023
IssuesEvent                       1610
PullRequestReviewEvent             832
ForkEvent                          747
ReleaseEvent                       743
CommitCommentEvent                 593
PullRequestReviewCommentEvent      281
PublicEvent                        146
GollumEvent                        105
MemberEvent                         49
Name: count, dtype: int64

# Extracting Repository Information

In [12]:
# Extracting values from the 'repo' column
repo_df = pd.json_normalize(data['repo'])

In [13]:
repo_df

Unnamed: 0,id,name,url
0,582174284,ehwu106/Gmail-Filter-Solution,https://api.github.com/repos/ehwu106/Gmail-Fil...
1,583140987,Cefqrn/pyxpr,https://api.github.com/repos/Cefqrn/pyxpr
2,383940088,ShadowMario/FNF-PsychEngine,https://api.github.com/repos/ShadowMario/FNF-P...
3,2663796,lexik/LexikTranslationBundle,https://api.github.com/repos/lexik/LexikTransl...
4,410004154,Lombiq/TheBootstrapTheme,https://api.github.com/repos/Lombiq/TheBootstr...
...,...,...,...
101912,568383720,BlaaSwe/Base-Soaring-Signalytic,https://api.github.com/repos/BlaaSwe/Base-Soar...
101913,530264309,usahexo/2452bb12a8217,https://api.github.com/repos/usahexo/2452bb12a...
101914,3234987,tgstation/tgstation,https://api.github.com/repos/tgstation/tgstation
101915,541393257,JetBlack011/convex-structures,https://api.github.com/repos/JetBlack011/conve...


In [7]:
repo_df['name'].nunique()

43782

In [8]:
repo_df['name'].value_counts()

name
Lombiq/Orchard                           1398
unifyai/ivy                               885
B4kedBr3ad/yo                             605
Rolleander/yoyo_mirror                    585
thoth-station/prescriptions               583
                                         ... 
x3fang/x3fang.github.io                     1
DatBoiTim/DBT-BeeStation                    1
cool-RR/PySnooper                           1
enricopolanski/functional-programming       1
JetBlack011/convex-structures               1
Name: count, Length: 43782, dtype: int64

In [16]:
id = data['id']

In [17]:
id

0         26163418658
1         26163418659
2         26163418660
3         26163418664
4         26163418665
             ...     
101912    26163700494
101913    26163700498
101914    26163700502
101915    26163700504
101916    26163701098
Name: id, Length: 101917, dtype: int64

In [20]:
repo_df.columns= ['repo_id', 'repo_name', 'repo_url']

In [26]:
repo_info = pd.concat([id, repo_df], axis=1)

In [27]:
repo_info

Unnamed: 0,id,repo_id,repo_name,repo_url
0,26163418658,582174284,ehwu106/Gmail-Filter-Solution,https://api.github.com/repos/ehwu106/Gmail-Fil...
1,26163418659,583140987,Cefqrn/pyxpr,https://api.github.com/repos/Cefqrn/pyxpr
2,26163418660,383940088,ShadowMario/FNF-PsychEngine,https://api.github.com/repos/ShadowMario/FNF-P...
3,26163418664,2663796,lexik/LexikTranslationBundle,https://api.github.com/repos/lexik/LexikTransl...
4,26163418665,410004154,Lombiq/TheBootstrapTheme,https://api.github.com/repos/Lombiq/TheBootstr...
...,...,...,...,...
101912,26163700494,568383720,BlaaSwe/Base-Soaring-Signalytic,https://api.github.com/repos/BlaaSwe/Base-Soar...
101913,26163700498,530264309,usahexo/2452bb12a8217,https://api.github.com/repos/usahexo/2452bb12a...
101914,26163700502,3234987,tgstation/tgstation,https://api.github.com/repos/tgstation/tgstation
101915,26163700504,541393257,JetBlack011/convex-structures,https://api.github.com/repos/JetBlack011/conve...


In [28]:
repo_info.to_csv('D:/GhArchive/Demo_Processed_Files&Script/repo_info.csv', index= False)

# Group data according to event types

In [29]:
data['type'].value_counts()

type
PushEvent                        61501
CreateEvent                      12414
PullRequestEvent                 10440
IssueCommentEvent                 5414
DeleteEvent                       4019
WatchEvent                        3023
IssuesEvent                       1610
PullRequestReviewEvent             832
ForkEvent                          747
ReleaseEvent                       743
CommitCommentEvent                 593
PullRequestReviewCommentEvent      281
PublicEvent                        146
GollumEvent                        105
MemberEvent                         49
Name: count, dtype: int64

## Push Events

In [30]:
push_events= data[data['type']=='PushEvent']

In [31]:
push_events

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,26163418658,PushEvent,"{'id': 119809980, 'login': 'ehwu106', 'display...","{'id': 582174284, 'name': 'ehwu106/Gmail-Filte...","{'push_id': 12147229638, 'size': 2, 'distinct_...",True,2023-01-01 00:00:00+00:00,
1,26163418659,PushEvent,"{'id': 77187908, 'login': 'Cefqrn', 'display_l...","{'id': 583140987, 'name': 'Cefqrn/pyxpr', 'url...","{'push_id': 12147229637, 'size': 2, 'distinct_...",True,2023-01-01 00:00:00+00:00,
4,26163418665,PushEvent,"{'id': 8517910, 'login': 'LombiqBot', 'display...","{'id': 410004154, 'name': 'Lombiq/TheBootstrap...","{'push_id': 12147229641, 'size': 0, 'distinct_...",True,2023-01-01 00:00:00+00:00,"{'id': 8158177, 'login': 'Lombiq', 'gravatar_i..."
5,26163418667,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 250035045, 'name': 'ZamulaK/COVID-19', ...","{'push_id': 12147229642, 'size': 1, 'distinct_...",True,2023-01-01 00:00:00+00:00,
6,26163418668,PushEvent,"{'id': 118964436, 'login': 'SS7SS', 'display_l...","{'id': 583726411, 'name': 'SS7SS/Quran_Linux',...","{'push_id': 12147229640, 'size': 1, 'distinct_...",True,2023-01-01 00:00:00+00:00,
...,...,...,...,...,...,...,...,...
101910,26163700491,PushEvent,"{'id': 91348256, 'login': 'Pho86', 'display_lo...","{'id': 553257090, 'name': 'Pho86/WaterTracker'...","{'push_id': 12147404986, 'size': 1, 'distinct_...",True,2023-01-01 00:59:59+00:00,
101911,26163700492,PushEvent,"{'id': 121700457, 'login': 'jessieaaguirre', '...","{'id': 583801195, 'name': 'jessieaaguirre/cv',...","{'push_id': 12147404983, 'size': 1, 'distinct_...",True,2023-01-01 00:59:59+00:00,
101912,26163700494,PushEvent,"{'id': 93773753, 'login': 'BlaaSwe', 'display_...","{'id': 568383720, 'name': 'BlaaSwe/Base-Soarin...","{'push_id': 12147404982, 'size': 1, 'distinct_...",True,2023-01-01 00:59:59+00:00,
101913,26163700498,PushEvent,"{'id': 110655703, 'login': 'sfj297092319', 'di...","{'id': 530264309, 'name': 'usahexo/2452bb12a82...","{'push_id': 12147404988, 'size': 1, 'distinct_...",True,2023-01-01 00:59:59+00:00,"{'id': 110656312, 'login': 'usahexo', 'gravata..."


In [32]:
push_events.isnull().sum()

id                0
type              0
actor             0
repo              0
payload           0
public            0
created_at        0
org           49010
dtype: int64

In [33]:
push_events.drop(columns=['org'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  push_events.drop(columns=['org'], inplace=True)


In [34]:
push_events.isnull().sum()

id            0
type          0
actor         0
repo          0
payload       0
public        0
created_at    0
dtype: int64

In [84]:
push_id = push_events ['Id']

In [93]:
# Normalize the 'actor' column
push_actor_df = pd.json_normalize(push_events['actor'])

# Rename the columns to include the prefix 'actor_'
push_actor_df.columns = [f"actor_{col}" for col in push_actor_df.columns]


# Reset the index of 'push_id' and 'push_actor_df'
push_id.reset_index(drop=True, inplace=True)
push_actor_df.reset_index(drop=True, inplace=True)

# Concatenate 'push_id' with 'push_actor_df'
push_actor_df = pd.concat([push_id, push_actor_df], axis=1)

# Display the shape of the concatenated DataFrame
print(push_actor_df.shape)



(61501, 7)


In [91]:
push_id

0         26163418658
1         26163418659
4         26163418665
5         26163418667
6         26163418668
             ...     
101910    26163700491
101911    26163700492
101912    26163700494
101913    26163700498
101915    26163700504
Name: Id, Length: 61501, dtype: int64

In [90]:
push_actor_df

Unnamed: 0,0,1,2,3,4,5,6
0,2.616342e+10,119809980.0,ehwu106,ehwu106,,https://api.github.com/users/ehwu106,https://avatars.githubusercontent.com/u/119809...
1,2.616342e+10,77187908.0,Cefqrn,Cefqrn,,https://api.github.com/users/Cefqrn,https://avatars.githubusercontent.com/u/77187908?
4,2.616342e+10,118964436.0,SS7SS,SS7SS,,https://api.github.com/users/SS7SS,https://avatars.githubusercontent.com/u/118964...
5,2.616342e+10,41898282.0,github-actions[bot],github-actions,,https://api.github.com/users/github-actions[bot],https://avatars.githubusercontent.com/u/41898282?
6,2.616342e+10,21151734.0,drphil3d,drphil3d,,https://api.github.com/users/drphil3d,https://avatars.githubusercontent.com/u/21151734?
...,...,...,...,...,...,...,...
61485,,41898282.0,github-actions[bot],github-actions,,https://api.github.com/users/github-actions[bot],https://avatars.githubusercontent.com/u/41898282?
61487,,8581566.0,FRE3X,FRE3X,,https://api.github.com/users/FRE3X,https://avatars.githubusercontent.com/u/8581566?
61491,,41898282.0,github-actions[bot],github-actions,,https://api.github.com/users/github-actions[bot],https://avatars.githubusercontent.com/u/41898282?
61495,,41898282.0,github-actions[bot],github-actions,,https://api.github.com/users/github-actions[bot],https://avatars.githubusercontent.com/u/41898282?


In [94]:
# Normalize the 'actor' column
push_repo_df = pd.json_normalize(push_events['repo'])

# Rename the columns to include the prefix 'actor_'
push_repo_df.columns = [f"repo_{col}" for col in push_repo_df.columns]

#push_actor_df = pd.concat([push_id, push_actor_df], axis= 1)


In [49]:
# Normalize the 'actor' column
push_payload_df = pd.json_normalize(push_events['payload'])

# Rename the columns to include the prefix 'actor_'
push_payload_df.columns = [f"payload_{col}" for col in push_payload_df.columns]

In [70]:
push_actor_df.isnull().sum()

actor_id               0
actor_login            0
actor_display_login    0
actor_gravatar_id      0
actor_url              0
actor_avatar_url       0
dtype: int64

In [71]:
push_repo_df.isnull().sum()

repo_id      0
repo_name    0
repo_url     0
dtype: int64

In [73]:
push_payload_df.isnull().sum()

payload_push_id          0
payload_size             0
payload_distinct_size    0
payload_ref              0
payload_head             0
payload_before           0
payload_commits          0
dtype: int64

In [74]:
push_info.isnull().sum()

Id            0
type          0
created_at    0
dtype: int64

In [50]:
push_events.columns

Index(['Id', 'type', 'actor', 'repo', 'payload', 'public', 'created_at'], dtype='object')

In [51]:
push_events.columns= ['Id', 'type', 'actor', 'repo', 'payload', 'public', 'created_at']

In [52]:
push_info= push_events [['Id', 'type', 'created_at']]

In [95]:
# Reset the indices of all DataFrames
push_info.reset_index(drop=True, inplace=True)
push_actor_df.reset_index(drop=True, inplace=True)
push_repo_df.reset_index(drop=True, inplace=True)
push_payload_df.reset_index(drop=True, inplace=True)

# Concatenate the DataFrames to create 'push_event_detailed'
push_event_detailed = pd.concat([push_info, push_actor_df, push_repo_df, push_payload_df], axis=1)

In [96]:
push_event_detailed

Unnamed: 0,Id,type,created_at,Id.1,actor_id,actor_login,actor_display_login,actor_gravatar_id,actor_url,actor_avatar_url,repo_id,repo_name,repo_url,payload_push_id,payload_size,payload_distinct_size,payload_ref,payload_head,payload_before,payload_commits
0,26163418658,PushEvent,2023-01-01 00:00:00+00:00,26163418658,119809980,ehwu106,ehwu106,,https://api.github.com/users/ehwu106,https://avatars.githubusercontent.com/u/119809...,582174284,ehwu106/Gmail-Filter-Solution,https://api.github.com/repos/ehwu106/Gmail-Fil...,12147229638,2,2,refs/heads/main,8fbcb0a5be7f1ae98c620ffc445f8212da279c4b,27e76fd2920c98cf825daefa9469cb202944d96d,[{'sha': '01882b15808c6cc63f4075eea105de4f608e...
1,26163418659,PushEvent,2023-01-01 00:00:00+00:00,26163418659,77187908,Cefqrn,Cefqrn,,https://api.github.com/users/Cefqrn,https://avatars.githubusercontent.com/u/77187908?,583140987,Cefqrn/pyxpr,https://api.github.com/repos/Cefqrn/pyxpr,12147229637,2,2,refs/heads/main,f64e5c366a20276fd1499cc485e131c08aeba5ee,367a5d47552f98e7acffe2f20b9a8c82f34f71a9,[{'sha': '61696611ba5a9edd83c997d8a6cc477fa483...
2,26163418665,PushEvent,2023-01-01 00:00:00+00:00,26163418665,8517910,LombiqBot,LombiqBot,,https://api.github.com/users/LombiqBot,https://avatars.githubusercontent.com/u/8517910?,410004154,Lombiq/TheBootstrapTheme,https://api.github.com/repos/Lombiq/TheBootstr...,12147229641,0,0,refs/heads/master,1c0d357e00552ca5a53e2a94573fd9d6f73fcdf5,1c0d357e00552ca5a53e2a94573fd9d6f73fcdf5,[]
3,26163418667,PushEvent,2023-01-01 00:00:00+00:00,26163418667,41898282,github-actions[bot],github-actions,,https://api.github.com/users/github-actions[bot],https://avatars.githubusercontent.com/u/41898282?,250035045,ZamulaK/COVID-19,https://api.github.com/repos/ZamulaK/COVID-19,12147229642,1,1,refs/heads/web-data,95bf957f5188641598151af8c35872686f18bd85,db49e743d0ea2ce8aa87ebfd2de9936853a1ba95,[{'sha': '95bf957f5188641598151af8c35872686f18...
4,26163418668,PushEvent,2023-01-01 00:00:00+00:00,26163418668,118964436,SS7SS,SS7SS,,https://api.github.com/users/SS7SS,https://avatars.githubusercontent.com/u/118964...,583726411,SS7SS/Quran_Linux,https://api.github.com/repos/SS7SS/Quran_Linux,12147229640,1,1,refs/heads/tepthon,2585e306dd64c4b529956550bd2e48467d211fd7,39b84f330971e62a6b4d7931f4cbbe045e55c3af,[{'sha': '2585e306dd64c4b529956550bd2e48467d21...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61496,26163700491,PushEvent,2023-01-01 00:59:59+00:00,26163700491,91348256,Pho86,Pho86,,https://api.github.com/users/Pho86,https://avatars.githubusercontent.com/u/91348256?,553257090,Pho86/WaterTracker,https://api.github.com/repos/Pho86/WaterTracker,12147404986,1,1,refs/heads/main,b68cbbbaf8cc41cc91d8bf99074ee21cd41c3593,3a3b3709264cdbfa4b2887a36c56cb684c04dfa1,[{'sha': 'b68cbbbaf8cc41cc91d8bf99074ee21cd41c...
61497,26163700492,PushEvent,2023-01-01 00:59:59+00:00,26163700492,121700457,jessieaaguirre,jessieaaguirre,,https://api.github.com/users/jessieaaguirre,https://avatars.githubusercontent.com/u/121700...,583801195,jessieaaguirre/cv,https://api.github.com/repos/jessieaaguirre/cv,12147404983,1,1,refs/heads/main,28428ae14ec1d1f3b6fc22961a8a682f926fce79,11aabbc69f0941f38b3905cfb7f366390611656f,[{'sha': '28428ae14ec1d1f3b6fc22961a8a682f926f...
61498,26163700494,PushEvent,2023-01-01 00:59:59+00:00,26163700494,93773753,BlaaSwe,BlaaSwe,,https://api.github.com/users/BlaaSwe,https://avatars.githubusercontent.com/u/93773753?,568383720,BlaaSwe/Base-Soaring-Signalytic,https://api.github.com/repos/BlaaSwe/Base-Soar...,12147404982,1,1,refs/heads/main,2f9e119720495cc866feee8917395918f42822c1,6ec62036c0ce829f587df3bf5df78729bab83420,[{'sha': '2f9e119720495cc866feee8917395918f428...
61499,26163700498,PushEvent,2023-01-01 00:59:59+00:00,26163700498,110655703,sfj297092319,sfj297092319,,https://api.github.com/users/sfj297092319,https://avatars.githubusercontent.com/u/110655...,530264309,usahexo/2452bb12a8217,https://api.github.com/repos/usahexo/2452bb12a...,12147404988,1,1,refs/heads/main,d44bb3277a08429d0069dfd24db8adc3f0967bb9,2357ef55a5d425323c3be174545fdedf56042ab8,[{'sha': 'd44bb3277a08429d0069dfd24db8adc3f096...


In [97]:
push_event_detailed.isnull().sum()

Id                       0
type                     0
created_at               0
Id                       0
actor_id                 0
actor_login              0
actor_display_login      0
actor_gravatar_id        0
actor_url                0
actor_avatar_url         0
repo_id                  0
repo_name                0
repo_url                 0
payload_push_id          0
payload_size             0
payload_distinct_size    0
payload_ref              0
payload_head             0
payload_before           0
payload_commits          0
dtype: int64

### Further Normalizing Payload Commit Info

In [99]:
push_event_detailed.columns

Index(['Id', 'type', 'created_at', 'Id', 'actor_id', 'actor_login',
       'actor_display_login', 'actor_gravatar_id', 'actor_url',
       'actor_avatar_url', 'repo_id', 'repo_name', 'repo_url',
       'payload_push_id', 'payload_size', 'payload_distinct_size',
       'payload_ref', 'payload_head', 'payload_before', 'payload_commits'],
      dtype='object')

In [100]:
push_event_detailed.isnull().sum()

Id                       0
type                     0
created_at               0
Id                       0
actor_id                 0
actor_login              0
actor_display_login      0
actor_gravatar_id        0
actor_url                0
actor_avatar_url         0
repo_id                  0
repo_name                0
repo_url                 0
payload_push_id          0
payload_size             0
payload_distinct_size    0
payload_ref              0
payload_head             0
payload_before           0
payload_commits          0
dtype: int64

In [102]:
push_event_detailed.columns

Index(['Id', 'type', 'created_at', 'Id', 'actor_id', 'actor_login',
       'actor_display_login', 'actor_gravatar_id', 'actor_url',
       'actor_avatar_url', 'repo_id', 'repo_name', 'repo_url',
       'payload_push_id', 'payload_size', 'payload_distinct_size',
       'payload_ref', 'payload_head', 'payload_before', 'payload_commits'],
      dtype='object')

In [103]:
push_event_detailed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61501 entries, 0 to 61500
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   Id                     61501 non-null  int64              
 1   type                   61501 non-null  object             
 2   created_at             61501 non-null  datetime64[ns, UTC]
 3   Id                     61501 non-null  int64              
 4   actor_id               61501 non-null  int64              
 5   actor_login            61501 non-null  object             
 6   actor_display_login    61501 non-null  object             
 7   actor_gravatar_id      61501 non-null  object             
 8   actor_url              61501 non-null  object             
 9   actor_avatar_url       61501 non-null  object             
 10  repo_id                61501 non-null  int64              
 11  repo_name              61501 non-null  object         

In [105]:
# Normalize the 'actor' column
push_payload_commits_df = pd.json_normalize(push_event_detailed['payload_commits'])

# Rename the columns to include the prefix 'actor_'
push_payload_commits_df.columns = [f"payload_commits{col}" for col in push_payload_commits_df.columns]

In [111]:
push_payload_commits_df.isnull().sum()

payload_commits0      4454
payload_commits1     55517
payload_commits2     59076
payload_commits3     59663
payload_commits4     60266
payload_commits5     60564
payload_commits6     60720
payload_commits7     60817
payload_commits8     60879
payload_commits9     60931
payload_commits10    60980
payload_commits11    61018
payload_commits12    61037
payload_commits13    61067
payload_commits14    61112
payload_commits15    61129
payload_commits16    61153
payload_commits17    61169
payload_commits18    61179
payload_commits19    61188
dtype: int64

#### It can be processed further, this is not our goal so will continue it later, meanwhile we convert our desired file into csv.

In [127]:
push_event_detailed.to_csv('D:/GhArchive/Demo_Processed_Files&Script/push_event_detailed.csv', index=False)

#### As csv file doesn't contain metadata, so we have to save the file as parquet, which store metadata as well, and for giant file it's best to save file as 

#### parquet file, due to its memory efficient properties.

#### CSV File Size : 50,128 KB
#### Parquet File Size: 18,914 KB , this is the significant difference, in our case the file size reduced with 73%

In [121]:
# Save the DataFrame to a Parquet file
push_event_detailed.to_parquet('D:/GhArchive/Demo_Processed_Files&Script/push_event_detailed.parquet', index=False)

### Let's test whether it saves meta_data or not

In [122]:
push_event_detailed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61501 entries, 0 to 61500
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   Id                     61501 non-null  int64              
 1   type                   61501 non-null  object             
 2   created_at             61501 non-null  datetime64[ns, UTC]
 3   actor_id               61501 non-null  int64              
 4   actor_login            61501 non-null  object             
 5   actor_display_login    61501 non-null  object             
 6   actor_gravatar_id      61501 non-null  object             
 7   actor_url              61501 non-null  object             
 8   actor_avatar_url       61501 non-null  object             
 9   repo_id                61501 non-null  int64              
 10  repo_name              61501 non-null  object             
 11  repo_url               61501 non-null  object         

In [123]:
# To read the Parquet file back into a DataFrame
df_from_parquet = pd.read_parquet('D:/GhArchive/Demo_Processed_Files&Script/push_event_detailed.parquet')

In [125]:
df_from_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61501 entries, 0 to 61500
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   Id                     61501 non-null  int64              
 1   type                   61501 non-null  object             
 2   created_at             61501 non-null  datetime64[ns, UTC]
 3   actor_id               61501 non-null  int64              
 4   actor_login            61501 non-null  object             
 5   actor_display_login    61501 non-null  object             
 6   actor_gravatar_id      61501 non-null  object             
 7   actor_url              61501 non-null  object             
 8   actor_avatar_url       61501 non-null  object             
 9   repo_id                61501 non-null  int64              
 10  repo_name              61501 non-null  object             
 11  repo_url               61501 non-null  object         

#### Same process can be adpoted for different event analysis like: pull_request_event, create_event and all.

# Next Event