In [1]:
import pandas as pd
import re

# Preprocess the Company Column
To create a sample that can serve as a benchmark for the project, the sample must also be representative. Preprocessing the company column will reveal the first layer of challenges.

In [2]:
# Load the source table; index_col=False keeps the first CSV column as data
# instead of using it as the row index.
github_data = pd.read_csv(
    'non_missing_company_data.csv',
    index_col=False,
)

In [5]:
# Show the complete, untruncated record whose idd is 1903
print(github_data.loc[github_data['idd'].eq(1903)].to_string())

      idd                  name                                                                                                                                                       bio                                               website                           company         id            description                                                                                                                                                                                                        readme
756  1903  deeplearning-chatbot  I enjoy the aggregate process of forward propagation and backward propagation by my neurons majorly focused towards the applications of Neural Networks.  https://www.linkedin.com/in/arijit-ganguly-27224083/  University of Texas at Arlington  138488180  Deep Learning Chatbot  # Chatbot using SEQ2SEQ model\r\nBased on Udemy tutorials and data, weplan to build a SEQ2SEQ model to chat us.\r\n\r\n## Built With \r\n* Python 3.8.8\r\n\r\n## Packages Used\r

In [4]:
# Preview the first five rows of the loaded data
github_data.head(n=5)

Unnamed: 0,idd,name,bio,website,company,id,description,readme
0,1,Algorithms-DataStructures-Python,Former Deep Learning Intern | Lead Programmer...,https://www.callitabhi.com/,CallitAbhi,672799594,Welcome to my Algorithms and Data Structures r...,# Algorithms-DataStructures-Python\r\n\r\n- `b...
1,9,Hindi-character-recognition,,,Sarvajanik College of Engineering & Technology,667535261,"In this work, we propose a technique to recogn...","# Hindi-character-recognition\r\nIn this work,..."
2,12,plant_leaf_disease_detection,Machine learning & Deep Learning Practitioner....,https://abhinav3.github.io/,IIT Guwahati,692408006,plant leaf disease detection,plant leaf disease detection and classificatio...
3,16,tello_python,IoT | Deep Learning | Machine Learning | Embed...,https://www.linkedin.com/in/adithya-u-r-795866...,KTH Royal Insitute Of Technology,185018601,Gesture based control of Tello drone using python,# TelloSDKPy\r\nDJI Tello drone python interfa...
4,18,Deep-Learning-Projects,Machine Learning Enthusiast and loves to innov...,,Tata Consultancy Services,431716441,It contains Deep-Learning projects designed on...,# Deep-Learning-Projects\r\nIt contains Deep-L...


In [5]:
# Preprocessing function

def preprocess_company_name(name):
    """Normalise a raw company value so that spelling variants share one label.

    The steps run in this order:

    1. Remove anything that looks like an e-mail address (any domain, not
       only gmail).
    2. Lowercase the text.
    3. Replace every character that is not an ASCII letter, a digit, ``&``
       or whitespace with a space.
    4. Collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    name : str or missing value
        Raw company entry. Missing values (``None``, ``NaN``, ``pd.NA``)
        are accepted; other non-string scalars are converted with ``str``.

    Returns
    -------
    str
        The cleaned label. It is the empty string when the input is missing
        or contains nothing but an e-mail address / punctuation.
    """
    # Missing cells would otherwise make re.sub raise a TypeError.
    if not isinstance(name, str):
        if pd.isna(name):
            return ''
        name = str(name)

    # Remove e-mail addresses: they identify a person, not a company
    email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    name = re.sub(email_regex, '', name)

    # Convert to lowercase
    name = name.lower()

    # Replace special characters with a space. The text is already lowercase,
    # so a-z is sufficient here.
    # NOTE(review): non-ASCII letters (accented, CJK, ...) are also replaced,
    # so such names are truncated or emptied -- confirm this is intended.
    name = re.sub(r'[^a-z0-9&\s]', ' ', name)

    # Collapse repeated whitespace and trim both ends
    return re.sub(r'\s+', ' ', name).strip()

# Clean every raw company value and store the result in a new column
github_data['cleaned_company'] = github_data['company'].map(preprocess_company_name)

# Show raw and cleaned names side by side for the first rows
github_data.loc[:, ['company', 'cleaned_company']].head()

Unnamed: 0,company,cleaned_company
0,CallitAbhi,callitabhi
1,Sarvajanik College of Engineering & Technology,sarvajanik college of engineering & technology
2,IIT Guwahati,iit guwahati
3,KTH Royal Insitute Of Technology,kth royal insitute of technology
4,Tata Consultancy Services,tata consultancy services


In [6]:
# Tally how many rows carry each cleaned company label (most frequent first)
company_counts = github_data.loc[:, 'cleaned_company'].value_counts()
company_counts

cleaned_company
microsoft                         2324
google                            1530
amazon                            1116
student                           1074
nvidia                             704
                                  ... 
ufopa                                1
verteva                              1
duke university health system        1
usyd master                          1
kochi university of technology       1
Name: count, Length: 50755, dtype: int64

In [7]:
# Number of distinct company labels before vs. after cleaning
unique_companies_before, unique_companies_after = (
    github_data[column].nunique() for column in ('company', 'cleaned_company')
)

(unique_companies_before, unique_companies_after)

(56429, 50755)

In [9]:
# Show the complete, untruncated record whose idd is 233112
print(github_data.loc[github_data['idd'].eq(233112)].to_string())

          idd                        name                                                        bio website                  company         id                                                                                                                                                                                                                           description                          readme cleaned_company
92309  233112  Slitherin-machine-learning  Data Engineer(Python) / Software Engineer(ReactJS/NextJS)     NaN  cjwicaksana15@gmail.com  262851965  in dimension 25x25 this slitherin in the data of 30 tries the slitherin can survive at the average of length 66. This is can be achieve by using A* calculation and additional data from its surrounding to calculate the best path.  Run main.py to run the program                


## Initial Sampling
Draw a random sample of 100 unique (cleaned) company names and investigate one row for each of them.

In [8]:
# Number of distinct cleaned company labels drawn for manual inspection
N_SAMPLED_COMPANIES = 100

# Get unique group labels
unique_groups = github_data['cleaned_company'].unique()

# Sample N_SAMPLED_COMPANIES groups; the fixed random_state keeps the draw reproducible
sampled_groups = (
    pd.Series(unique_groups)
    .sample(N_SAMPLED_COMPANIES, random_state=26)
    .tolist()
)

# Filter the DataFrame to include only the sampled groups
sampled_df = github_data[github_data['cleaned_company'].isin(sampled_groups)]

# Keep the first actual row of each sampled group, ordered by label.
# GroupBy.first() is deliberately not used: it takes the first non-null value
# of every column independently, so one output row could combine fields
# (bio, website, readme, ...) that belong to different records.
first_rows = (
    sampled_df
    .drop_duplicates(subset='cleaned_company', keep='first')
    .sort_values('cleaned_company')
    .reset_index(drop=True)
)

# Keep every column, with 'cleaned_company' leading
result = first_rows[
    ['cleaned_company', *first_rows.columns.drop('cleaned_company')]
].copy()

print(result.to_string(index=False))

                                                                                                                                                                                cleaned_company    idd                                                      name                                                                                                                                                                              bio                                                               website                                                                                                                                                                                                 company        id                                                                                                                                                                                                                                                                                               

In [9]:
# Write the sampled rows to an Excel workbook, without the row index
result.to_excel(excel_writer='github_sample_3.xlsx', index=False)