In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import xgboost as xgb


In [173]:
df = pd.read_csv('github-repo-data.csv')

In [174]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150520 entries, 0 to 150519
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Name             150520 non-null  object 
 1   Description      144890 non-null  object 
 2   URL              150520 non-null  object 
 3   Created At       150520 non-null  object 
 4   Updated At       150520 non-null  object 
 5   Homepage         54809 non-null   object 
 6   Size             150520 non-null  int64  
 7   Stars            150520 non-null  int64  
 8   Forks            150518 non-null  float64
 9   Issues           150520 non-null  int64  
 10  Language         139289 non-null  object 
 11  License          113325 non-null  object 
 12  Topics           150520 non-null  object 
 13  Has Issues       150520 non-null  bool   
 14  Has Projects     150520 non-null  bool   
 15  Has Downloads    150520 non-null  bool   
 16  Has Wiki         150520 non-null  bool

In [175]:
# Drop columns that are not useful for prediction
columns_to_drop = ['Name', 'Description', 'URL', 'Homepage', 'Created At', 'Updated At', 'Default Branch', 'Language', "License"]

df = df.drop(columns = columns_to_drop)

In [176]:
def clean_topics(x):
    if isinstance(x, str) and x.startswith('[') and x.endswith(']'):
        # Remove the brackets and split by commas
        items = re.split(r',\s*', x[1:-1])
        # Strip any leading or trailing quotes and whitespace
        return [item.strip().strip("'").strip('"') for item in items]
    return []

# Apply the cleaning function
df['Topics'] = df['Topics'].apply(clean_topics)

# Join the list of words into a single string
df['Topics'] = df['Topics'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

In [177]:
# Vectorize the text
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Topics'])  

num_clusters = 50 
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['Cluster_Labels'] = kmeans.fit_predict(tfidf_matrix)

                                              Topics  Cluster_Labels
0  adt algebra algebraic algebraic-data-types fan...               0
1  converter markdown md pdf pdf-generation template              18
2  computer-vision dense-flow image-processing op...               0
3                                                                  0
4                       archived deprecated obsolete               0


In [178]:
df['Cluster_Labels'].value_counts()

Cluster_Labels
0     113287
46      2799
5       2466
19      2406
8       2085
31      1894
34      1556
21      1548
2       1352
47      1307
43      1294
35      1147
16      1129
24       919
23       913
45       884
25       876
13       869
9        798
6        759
38       756
36       740
33       738
42       724
20       673
37       619
26       568
32       542
4        533
1        467
10       412
48       282
30       261
17       260
11       257
14       254
7        248
18       240
41       235
39       187
29       187
15       180
40       168
49       154
12       134
27       130
22       120
3         55
44        55
28        53
Name: count, dtype: int64

In [179]:
df = df.drop('Topics', axis=1)
df.isnull().sum()

Size               0
Stars              0
Forks              2
Issues             0
Has Issues         0
Has Projects       0
Has Downloads      0
Has Wiki           0
Has Pages          0
Has Discussions    0
Is Fork            0
Is Archived        0
Is Template        0
Cluster_Labels     0
dtype: int64

In [180]:
df = df.dropna(subset=['Forks'])

In [161]:
# df['Language'].value_counts(dropna=False)

Language
Python        23973
JavaScript    22251
NaN           11231
Java          10759
TypeScript     8118
              ...  
Ballerina         1
Uno               1
D2                1
RPM Spec          1
ZIL               1
Name: count, Length: 342, dtype: int64

In [162]:
# print(df['License'].value_counts(dropna=False))

License
MIT                   54801
NaN                   37194
Apache-2.0            19220
NOASSERTION           14792
GPL-3.0                8978
BSD-3-Clause           3640
GPL-2.0                2432
AGPL-3.0               1647
BSD-2-Clause           1364
CC0-1.0                1039
MPL-2.0                 890
LGPL-3.0                821
Unlicense               801
ISC                     626
LGPL-2.1                428
CC-BY-4.0               330
CC-BY-SA-4.0            253
EPL-1.0                 248
WTFPL                   213
Zlib                    140
BSL-1.0                 117
OFL-1.1                 105
EPL-2.0                  82
MIT-0                    80
MS-PL                    51
OSL-3.0                  37
0BSD                     35
Artistic-2.0             26
LPPL-1.3c                21
BSD-4-Clause             16
EUPL-1.2                 15
PostgreSQL               15
BSD-3-Clause-Clear       14
UPL-1.0                  13
AFL-3.0                   7
NCSA        

In [163]:
# # Check the distribution of cols with missing values
# license_skew = df['License'].value_counts().skew() if not df['License'].isnull().all() else None
# language_skew = df['Language'].value_counts().skew() if not df['Language'].isnull().all() else None

# license_skew, language_skew

(5.072512415067896, 8.475609551100767)

In [164]:
# # Impute language col with the mode 
# mode_language = df['Language'].mode()[0]
# df['Language'].fillna(mode_language, inplace=True)

In [165]:
# # Fill license with Unknown (create new category)
# df['License'].fillna('Unknown', inplace=True)

In [166]:
# # Use one hot encoding
# df = pd.get_dummies(df, columns=['Language', 'License'], prefix=['Language', 'License'])

In [182]:
# # Converting all values to numeric
# df = df.replace({True: 1, False: 0})

In [184]:
X = df.drop(columns=['Stars'])
y = df['Stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to calcualte log rmse
def calculate_log_rmse(y_test, y_pred):
    # Convert lists to numpy arrays 
    y_test = np.array(y_test)
    y_pred = np.array(y_pred)

    # Ensure all predicted values are non-negative
    y_pred = np.maximum(y_pred, 0)
    
    # Calculate the logarithm of the star counts plus one
    log_true = np.log(y_test + 1)
    log_predicted = np.log(y_pred + 1)
    
    # Calculate the squared differences
    squared_diff = (log_true - log_predicted) ** 2
    
    # Compute the mean of the squared differences
    mean_squared_diff = np.mean(squared_diff)
    
    # Take the square root of the mean to get the RMSE
    log_rmse = np.sqrt(mean_squared_diff)
    
    return log_rmse

In [201]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Train
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)

xgb_log_rmse = calculate_log_rmse(y_test, y_pred)

In [204]:
xgb_log_rmse

0.6674834471514748

In [188]:
test_df = pd.read_csv('github-repo-prediction-set.csv')

In [190]:
columns_to_drop = ['Name', 'Description', 'URL', 'Homepage', 'Created At', 'Updated At', 'Default Branch', 'Language', "License"]

test_df = test_df.drop(columns = columns_to_drop)

def clean_topics(x):
    if isinstance(x, str) and x.startswith('[') and x.endswith(']'):
        # Remove the brackets and split by commas
        items = re.split(r',\s*', x[1:-1])
        # Strip any leading or trailing quotes and whitespace
        return [item.strip().strip("'").strip('"') for item in items]
    return []

# Apply the cleaning function
test_df['Topics'] = test_df['Topics'].apply(clean_topics)

# Join the list of words into a single string
test_df['Topics'] = test_df['Topics'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(test_df['Topics'])  

num_clusters = 50 
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
test_df['Cluster_Labels'] = kmeans.fit_predict(tfidf_matrix)

test_df = test_df.drop('Topics', axis=1)

test_df = test_df.dropna(subset=['Forks'])

test_df = test_df.replace({True: 1, False: 0})

In [194]:
test_predictions = xgb_model.predict(test_df)

In [195]:
test_predictions

array([294.51886, 267.5087 , 309.9944 , ..., 568.4792 , 581.69635,
       859.2944 ], dtype=float32)

In [192]:
sub_file = pd.read_csv('submission-file.csv')

In [196]:
sub_file['Predicted_Stars'] = test_predictions

Visualization of Topics and Languages

In [216]:
df1 = pd.read_csv('github-repo-data.csv')

In [217]:
def clean_topics(x):
    if isinstance(x, str) and x.startswith('[') and x.endswith(']'):
        # Remove the brackets and split by commas
        items = re.split(r',\s*', x[1:-1])
        # Strip any leading or trailing quotes and whitespace
        return [item.strip().strip("'").strip('"') for item in items]
    return []

# Apply the cleaning function
df1['Topics'] = df1['Topics'].apply(clean_topics)

# Join the list of words into a single string
df1['Topics'] = df1['Topics'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df1['Topics'])  

num_clusters = 50 
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df1['Cluster_Labels'] = kmeans.fit_predict(tfidf_matrix)

df1 = df1.dropna(subset=['Forks'])

df1 = df1.replace({True: 1, False: 0})

In [219]:
relationship = pd.crosstab(df1['Topics'], df1['Language'])

print("Relationship between Clusters and Languages:")
print(relationship)



Relationship between Clusters and Languages:
Language                                            1C Enterprise  ABAP  \
Topics                                                                    
                                                                2     1   
0-rtt golang hijacking http-proxy multiplexing ...              0     0   
010editor binary-data dataparsing interpreter p...              0     0   
011y apm clickhouse grafana influxdb3 lgtm logq...              0     0   
053 3368 alpha emulator mmo python warcraft wow                 0     0   
...                                                           ...   ...   
zotero                                                          0     0   
zplug zsh                                                       0     0   
zsh                                                             0     0   
zsh-completions zsh-plugins                                     0     0   
zsh-theme                                              

In [220]:
relationship

Language,1C Enterprise,ABAP,AGS Script,AL,AMPL,ANTLR,APL,ASL,ASP,ASP.NET,...,eC,hoon,jq,mcfunction,nesC,ooc,q,reStructuredText,sed,wisp
Topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,2,1,6,2,1,2,1,8,4,1,...,0,0,0,1,3,1,1,0,1,1
0-rtt golang hijacking http-proxy multiplexing proxy quic,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
010editor binary-data dataparsing interpreter python,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
011y apm clickhouse grafana influxdb3 lgtm logql logs loki metrics monitoring observability opentelemetry otlp prometheus promql remote-write tempo timeseries zipkin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
053 3368 alpha emulator mmo python warcraft wow,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zotero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zplug zsh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zsh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zsh-completions zsh-plugins,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df.to_csv("submission.csv")