In [1]:
# import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

In [2]:
# Load the dataset index; later cells read its 'URL' and 'Name' columns.
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in
# the file -- confirm the file's real encoding.
df = pd.read_csv(
    "../data/data_files.csv",
    encoding="unicode_escape",
)

In [4]:
import re
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
import chardet # an ML model, It uses machine learning to detect the encoding of a file

# Custom transformer for case-folding
class CaseFoldingTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that lower-cases every string in the input."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new list holding ``str.lower()`` of each item of ``X``."""
        lowered = []
        for text in X:
            lowered.append(text.lower())
        return lowered

# Custom transformer for stop words removal
class StopWordsRemovalTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that drops English stop words from each text.

    Each text is split on whitespace, tokens that exactly match an entry of
    NLTK's English stop-word list are discarded, and the remaining tokens are
    re-joined with single spaces. Matching is exact (case-sensitive).
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a list with the stop words removed from each item of ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts to filter.

        Returns
        -------
        list of str
            One filtered text per input item, in the same order.
        """
        # A frozenset gives constant-time membership tests; the list returned
        # by NLTK would be scanned linearly for every single token.
        stop_words = frozenset(stopwords.words('english'))
        return [
            ' '.join(word for word in text.split() if word not in stop_words)
            for text in X
        ]
    
# Custom transformer to remove numbers from text
class NumberRemovalTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that deletes every run of digits from each text."""

    def fit(self, X, y=None):
        """No fitting is required; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a list with all digit sequences stripped from each item of ``X``."""
        digit_run = re.compile(r'\d+')
        stripped = []
        for text in X:
            stripped.append(digit_run.sub('', text))
        return stripped
    
class UrlToContentTransformer(BaseEstimator, TransformerMixin):
    """Download each URL in the input and return the response bodies as text.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    performs one HTTP GET per input item.

    NOTE(review): the HTTP status code is not checked, so the body of an error
    response (e.g. a 404 page) is returned like any other content -- confirm
    this is intended.
    """

    # Seconds to wait on the server before giving up. ``requests`` waits
    # indefinitely when no timeout is supplied, so one stalled host would
    # otherwise hang the whole transform.
    REQUEST_TIMEOUT_SECONDS = 30

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    @staticmethod
    def _decode(content):
        """Decode raw response bytes to ``str`` without raising on odd encodings.

        Bodies detected as ISO-8859-1 are decoded as such and everything else
        is tried as UTF-8 first. Only when UTF-8 fails does the method fall
        back to the detected encoding and, as a last resort, to UTF-8 with
        undecodable bytes replaced.
        """
        # chardet reports ``None`` when it cannot guess (e.g. empty content).
        detected = chardet.detect(content)['encoding']
        if detected == "ISO-8859-1":
            return content.decode("iso-8859-1")
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError:
            pass
        if detected:
            try:
                return content.decode(detected)
            except (LookupError, UnicodeDecodeError):
                # Unknown codec name or a wrong guess: use the lossy fallback.
                pass
        return content.decode("utf-8", errors="replace")

    def transform(self, X):
        """Fetch every URL in ``X`` and return the decoded bodies.

        Parameters
        ----------
        X : iterable of str
            URLs to download.

        Returns
        -------
        list of str
            Decoded response body for each URL, in input order.

        Raises
        ------
        requests.exceptions.RequestException
            If a request fails or exceeds ``REQUEST_TIMEOUT_SECONDS``.
        """
        transformed_X = []
        # One session for the whole batch lets connections to the same host be
        # reused instead of being re-established for every URL.
        with requests.Session() as session:
            for url in X:
                response = session.get(url, timeout=self.REQUEST_TIMEOUT_SECONDS)
                transformed_X.append(self._decode(response.content))
        return transformed_X

        
# Define the preprocessing steps as (name, transformer) pairs, in execution order.
# Only URL download and number removal are active; the case-folding and
# stop-word steps below are deliberately left commented out.
preprocessing_steps = [
#     ('case_folding', CaseFoldingTransformer()),
#     ('stop_words_removal', StopWordsRemovalTransformer()),
    ('url_to_content', UrlToContentTransformer()),
    ('number_removal', NumberRemovalTransformer()),
]

In [5]:
# Bag-of-words can be implemented using "CountVectorizer" in sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
from sklearn.pipeline import Pipeline

# Two-stage pipeline: the text preprocessing sub-pipeline, followed by a
# bag-of-words vectorizer whose vocabulary is capped at 3000 features.
pipeline = Pipeline(
    steps=[
        ('preprocessing', Pipeline(steps=preprocessing_steps)),
        ('vectorizer', CountVectorizer(max_features=3000)),
    ]
)

In [7]:
# Run only the preprocessing stage (download each URL, strip numbers) and store
# the resulting text in a new 'Content' column. This issues one HTTP request
# per row of df['URL'], so the cell is slow and needs network access.
df['Content'] = pipeline['preprocessing'].fit_transform(df['URL'])

In [8]:
# Display the downloaded and preprocessed text column.
df['Content']

0        #description\n#author\n#timelimit\n#cmdlineops...
1        #description\n#author\n#timelimit\n#cmdlineops...
2        #description\n#author\n#timelimit\n#cmdlineops...
3        #description\n#author\n#timelimit\n#cmdlineops...
4        #description\n#author\n#timelimit\n#cmdlineops...
                               ...                        
21049    # frozen_string_literal: true\n\nrequire "abst...
21050    # frozen_string_literal: true\n\n# Order depen...
21051    # frozen_string_literal: true\n\n$: << File.ex...
21052    # frozen_string_literal: true\n\nif ENV["BUILD...
21053    # frozen_string_literal: true\n\nmodule Rails\...
Name: Content, Length: 21054, dtype: object

In [9]:
# Features are the preprocessed file contents; targets come from the 'Name' column.
# NOTE(review): 'Name' is presumably the programming-language label -- confirm
# against the CSV schema.
X = df['Content']
y = df['Name']

In [10]:
# The whole dataset is used for training; no held-out split is made in the
# cells shown here.
X_train = X
y_train = y

In [11]:
# Fit the vectorizer stage on the training text and build the bag-of-words matrix.
X_train_bow = pipeline['vectorizer'].fit_transform(X_train)

In [13]:
# Number of vocabulary terms kept by the vectorizer (capped by max_features=3000).
len( pipeline['vectorizer'].get_feature_names_out())

3000

In [14]:
# Inspect the learned vocabulary terms.
pipeline['vectorizer'].get_feature_names_out()

array(['__', '__all__', '__call__', ..., 'zip', 'zip_test_utils', 'zone'],
      dtype=object)

In [16]:
import os

# Show the current working directory and its parent; the parent is the base
# under which the "model" folder is created when the pipeline is saved.
print(os.getcwd())
print(os.path.dirname(os.getcwd()))

C:\Users\kiran\Desktop\Analysis-of-GAP-programming-practices-on-GitHub\notebooks
C:\Users\kiran\Desktop\Analysis-of-GAP-programming-practices-on-GitHub


In [17]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the bag-of-words features.
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so re-running the notebook yields the same fitted model (and the
# same saved artifact) instead of a different forest every time.
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train_bow, y_train)

In [40]:
import pickle

# Chain the already-fitted preprocessing/vectorizer pipeline with the
# already-fitted classifier, so one object maps raw URLs to predictions.
full_pipeline = Pipeline(steps=[
    ('full_preprocessing', pipeline),
    ('classifier', clf),
])

# Persist the combined pipeline under <parent of cwd>/model/classifier.pkl.
# The pickle stores the custom transformer classes by reference, so they must
# be defined/importable wherever the file is loaded.
project_root = os.path.dirname(os.getcwd())
clf_folder_path = os.path.join(project_root, "model")
os.makedirs(clf_folder_path, exist_ok=True)
clf_file_path = os.path.join(clf_folder_path, 'classifier.pkl')

with open(clf_file_path, 'wb') as model_file:
    pickle.dump(full_pipeline, model_file)


In [43]:
import joblib

# Load the saved pipeline.
# The file was written with pickle.dump, so it is read back with pickle.load;
# joblib.load is documented for files produced by joblib.dump.
# SECURITY: unpickling executes code embedded in the file -- only load model
# files that this project produced itself.
clf_folder_path = os.path.join(os.path.dirname(os.getcwd()), "model")
clf_file_path = os.path.join(clf_folder_path, 'classifier.pkl')
with open(clf_file_path, 'rb') as f:
    loaded_pipeline = pickle.load(f)

# Use the loaded pipeline for predictions: the input is a list of raw URLs,
# which the pipeline downloads, preprocesses, vectorizes and classifies.
predictions = loaded_pipeline.predict(['https://raw.githubusercontent.com/opencv/opencv/3aeaa3402389fc55e53fbc6f5741ca29f51032ee/modules/core/misc/java/src/java/core%2BMat.java'])

In [44]:
# Show the predicted label for the URL above.
# NOTE(review): the recorded output is 'PHP' although the URL ends in .java --
# the model's accuracy should be checked on held-out data.
predictions

array(['PHP'], dtype=object)