In [None]:
!pip install transformers datasets --quiet

In [None]:
import pandas as pd

# Read the combined comment dataset from disk
data = pd.read_csv('./combined_dataset_with_stars.csv')

# Derive a two-valued 'category' column from 'type':
# 'code' for rows typed 'code_comment', 'issue' for every other type
data['category'] = [
    'code' if kind == 'code_comment' else 'issue'
    for kind in data['type']
]

# Upper bound on the rows kept per repo and per category when sampling
SAMPLES_PER_REPO_PER_CATEGORY = 1500

# Row counts before sampling: one row per repo, one column per category
initial_counts = (
    data
    .groupby(['repo', 'category'])
    .size()
    .unstack(fill_value=0)
)

# Sampling helper applied to the rows of a single repo
def sample_data(group, max_per_category=None, random_state=42):
    """Down-sample one group of rows to a bounded number per category.

    Rows whose 'category' is 'code' and rows whose 'category' is 'issue' are
    sampled independently (without replacement); a category with fewer rows
    than the cap is kept in full.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to sample from; must contain a 'category' column.
    max_per_category : int, optional
        Maximum number of rows kept for each category. When omitted, the
        notebook-level SAMPLES_PER_REPO_PER_CATEGORY constant is used.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the selection is repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled 'code' rows followed by the sampled 'issue' rows, with
        their original index labels.
    """
    if max_per_category is None:
        # Resolved at call time so the notebook constant can still be tuned
        max_per_category = SAMPLES_PER_REPO_PER_CATEGORY

    sampled_parts = []
    for category in ('code', 'issue'):
        category_rows = group[group['category'] == category]
        # Never request more rows than exist: sample() raises in that case
        keep = min(len(category_rows), max_per_category)
        sampled_parts.append(category_rows.sample(n=keep, random_state=random_state))

    return pd.concat(sampled_parts)

# Sample each repo's rows and stitch the pieces back together.
# Iterating over the groups (instead of groupby(...).apply) hands sample_data
# the full rows, including the 'repo' column, regardless of how the installed
# pandas version treats grouping columns inside apply().
sampled_groups = [sample_data(repo_rows) for _, repo_rows in data.groupby('repo')]
if sampled_groups:
    filtered_data = pd.concat(sampled_groups, ignore_index=True)
else:
    # No repos at all: keep the column layout but no rows
    filtered_data = data.iloc[0:0].copy()

# Both category columns are forced to exist so a dataset lacking one of them
# yields zeros instead of a KeyError below
category_columns = ['code', 'issue']

# Count the sampled result per repo and category
sampled_counts = (
    filtered_data
    .groupby(['repo', 'category'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=category_columns, fill_value=0)
)
initial_by_category = initial_counts.reindex(columns=category_columns, fill_value=0)

# Combine the initial and sampled counts into one DataFrame (one row per repo)
combined_counts = pd.DataFrame({
    'issue_comments': initial_by_category['issue'],
    'code_comments': initial_by_category['code'],
    'sampled_issue_comments': sampled_counts['issue'],
    'sampled_code_comments': sampled_counts['code']
})

print("Total (sampled) dataset size", len(filtered_data))
combined_counts


Total (sampled) dataset size 27315


Unnamed: 0_level_0,issue_comments,code_comments,sampled_issue_comments,sampled_code_comments
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Auto1111SDK/Auto1111SDK,18,11,18,11
Zulko/moviepy,1017,701,1017,701
bigskysoftware/htmx,702,5001,702,1500
django/django,411,31131,411,1500
elnormous/HTTPRequest,41,5,41,5
ethereum/go-ethereum,1140,10587,1140,1500
facebook/react,5308,32832,1500,1500
hotwired/turbo,616,19,616,19
joshmoody24/sitcom-simulator,5,230,5,230
nodejs/nodejs.org,153,346,153,346


In [None]:
from transformers import pipeline
import pandas as pd
from tqdm.auto import tqdm
import torch

# Continue with the sampled frame from the previous cell
# (this is the same object, not a copy)
data = filtered_data

# Device index shared by both pipelines: 0 when CUDA is available, otherwise -1
_device = 0 if torch.cuda.is_available() else -1

# Zero-shot classifier, used below to assign a purpose label to each comment
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=_device,
)

# Sentiment classifier, used below to label each comment's sentiment
sentiment_classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=_device,
)

# Candidate purpose labels offered to the zero-shot model, per comment kind
code_comment_categories = [
    "explanation",
    "future work",
    "deprecated",
]
issue_comment_categories = [
    "bug report",
    "feature request",
    "question",
    "solution",
    "discussion",
    "conclusion",
]

def save_to_csv(df, filename='data_backup.csv'):
    """Write ``df`` to ``filename`` as CSV, omitting the index, and report it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    filename : str, optional
        Destination path of the CSV file.
    """
    df.to_csv(path_or_buf=filename, index=False)
    # Confirmation line so progress is visible in the cell output
    print(f'Data saved to {filename}')

def classify_texts(classifier, texts, categories=None, batch_size=32,
                   checkpoint_every=25, checkpoint_prefix='data_backup',
                   progress=True):
    """Label ``texts`` in batches and return one label per text, in order.

    Parameters
    ----------
    classifier : callable
        Called as ``classifier(batch, candidate_labels=categories,
        truncation=True)`` when ``categories`` is given, otherwise as
        ``classifier(batch, truncation=True)``. Each per-text result must be a
        mapping with a ``'labels'`` list (first entry is used) in the first
        case, or a ``'label'`` entry in the second.
    texts : list of str
        Texts to classify.
    categories : list of str, optional
        Candidate labels; when given (and non-empty) the classifier is called
        with ``candidate_labels``.
    batch_size : int, optional
        Number of texts passed to the classifier per call.
    checkpoint_every : int or None, optional
        Write a checkpoint after every ``checkpoint_every``-th batch (batch
        indices 0, checkpoint_every, 2*checkpoint_every, ...). ``None`` or 0
        disables checkpointing.
    checkpoint_prefix : str, optional
        Checkpoints are written to ``f'{checkpoint_prefix}_{batch_index}.csv'``.
    progress : bool, optional
        Wrap the batch loop in a tqdm progress bar.

    Returns
    -------
    list
        The predicted label for every element of ``texts``.
    """
    results = []
    batch_starts = range(0, len(texts), batch_size)
    iterator = tqdm(batch_starts) if progress else batch_starts

    for batch_idx, start in enumerate(iterator):
        batch_texts = texts[start:start + batch_size]

        if categories:
            batch_results = classifier(batch_texts, candidate_labels=categories, truncation=True)
        else:
            batch_results = classifier(batch_texts, truncation=True)

        # A one-element batch may come back as a bare mapping instead of a
        # list (presumably version-dependent for the transformers pipelines —
        # TODO confirm); normalise so the extraction below is uniform.
        if isinstance(batch_results, dict):
            batch_results = [batch_results]

        if categories:
            # Labels are taken in the order returned; the first one is kept
            results.extend(result['labels'][0] for result in batch_results)
        else:
            results.extend(result['label'] for result in batch_results)

        # Checkpoint the labels produced so far. The interval is expressed in
        # batches so it does not depend on how batch_size divides a row count.
        if checkpoint_every and batch_idx % checkpoint_every == 0:
            checkpoint_path = f'{checkpoint_prefix}_{batch_idx}.csv'
            partial = pd.DataFrame({
                'text': list(texts[:len(results)]),
                'label': results,
            })
            partial.to_csv(checkpoint_path, index=False)
            print(f'Data saved to {checkpoint_path}')

    return results

texts = data['text'].tolist()
types = data['type'].tolist()

# Classify code and issue comments separately because each kind has its own
# candidate label set. The loop variable is deliberately not called `type`:
# a top-level loop would rebind the builtin for the rest of the notebook.
code_comments = [text for text, comment_type in zip(texts, types) if comment_type == 'code_comment']
issue_comments = [text for text, comment_type in zip(texts, types) if comment_type != 'code_comment']

all_intents_code = classify_texts(zero_shot_classifier, code_comments, code_comment_categories)
all_intents_issue = classify_texts(zero_shot_classifier, issue_comments, issue_comment_categories)

# Every text must have received exactly one label before the two result lists
# are interleaved; otherwise labels would silently shift to the wrong rows.
assert len(all_intents_code) == len(code_comments), "missing code-comment predictions"
assert len(all_intents_issue) == len(issue_comments), "missing issue-comment predictions"

# Combine the results back into the order of the original list
code_labels = iter(all_intents_code)
issue_labels = iter(all_intents_issue)
all_intents = [
    next(code_labels) if comment_type == 'code_comment' else next(issue_labels)
    for comment_type in types
]

data['purpose'] = all_intents
save_to_csv(data, 'data_with_purpose.csv')  # Save after classifying intents

# Classify sentiment
all_sentiment = classify_texts(sentiment_classifier, texts)
data['sentiment_class'] = all_sentiment
save_to_csv(data, 'data_with_sentiment.csv')  # Save after classifying sentiments

data[['repo', 'category', 'text', 'sentiment_class', 'purpose']]


  0%|          | 0/514 [00:00<?, ?it/s]

Data saved to data_backup_0.csv




Data saved to data_backup_25.csv
Data saved to data_backup_50.csv
Data saved to data_backup_75.csv
Data saved to data_backup_100.csv
Data saved to data_backup_125.csv
Data saved to data_backup_150.csv
Data saved to data_backup_175.csv
Data saved to data_backup_200.csv
Data saved to data_backup_225.csv
Data saved to data_backup_250.csv
Data saved to data_backup_275.csv
Data saved to data_backup_300.csv
Data saved to data_backup_325.csv
Data saved to data_backup_350.csv
Data saved to data_backup_375.csv
Data saved to data_backup_400.csv
Data saved to data_backup_425.csv
Data saved to data_backup_450.csv
Data saved to data_backup_475.csv
Data saved to data_backup_500.csv


  0%|          | 0/341 [00:00<?, ?it/s]

Data saved to data_backup_0.csv
Data saved to data_backup_25.csv
Data saved to data_backup_50.csv
Data saved to data_backup_75.csv
Data saved to data_backup_100.csv
Data saved to data_backup_125.csv
Data saved to data_backup_150.csv
Data saved to data_backup_175.csv
Data saved to data_backup_200.csv
Data saved to data_backup_225.csv
Data saved to data_backup_250.csv
Data saved to data_backup_275.csv
Data saved to data_backup_300.csv
Data saved to data_backup_325.csv
Data saved to data_with_purpose.csv


  0%|          | 0/854 [00:00<?, ?it/s]

Data saved to data_backup_0.csv




Data saved to data_backup_25.csv
Data saved to data_backup_50.csv
Data saved to data_backup_75.csv
Data saved to data_backup_100.csv
Data saved to data_backup_125.csv
Data saved to data_backup_150.csv
Data saved to data_backup_175.csv
Data saved to data_backup_200.csv
Data saved to data_backup_225.csv
Data saved to data_backup_250.csv
Data saved to data_backup_275.csv
Data saved to data_backup_300.csv
Data saved to data_backup_325.csv
Data saved to data_backup_350.csv
Data saved to data_backup_375.csv
Data saved to data_backup_400.csv
Data saved to data_backup_425.csv
Data saved to data_backup_450.csv
Data saved to data_backup_475.csv
Data saved to data_backup_500.csv
Data saved to data_backup_525.csv
Data saved to data_backup_550.csv
Data saved to data_backup_575.csv
Data saved to data_backup_600.csv
Data saved to data_backup_625.csv
Data saved to data_backup_650.csv
Data saved to data_backup_675.csv
Data saved to data_backup_700.csv
Data saved to data_backup_725.csv
Data saved to dat

Unnamed: 0,repo,category,text,sentiment_class,purpose
0,Auto1111SDK/Auto1111SDK,code,if model_path is None:,NEGATIVE,explanation
1,Auto1111SDK/Auto1111SDK,code,"pipe = StableDiffusionPipeline(""model.safetens...",NEGATIVE,explanation
2,Auto1111SDK/Auto1111SDK,code,"""mask"": ""Placeholder for mask array data""",NEGATIVE,explanation
3,Auto1111SDK/Auto1111SDK,code,Check if the URL format is correct,NEGATIVE,explanation
4,Auto1111SDK/Auto1111SDK,code,"negative_prompt = ""(deformed iris, deformed pu...",NEGATIVE,explanation
...,...,...,...,...,...
27310,zzzprojects/System.Linq.Dynamic.Core,issue,@isabekyan \r\nThe purpose is not really relev...,NEGATIVE,discussion
27311,zzzprojects/System.Linq.Dynamic.Core,issue,@StefH Addressed your comment and added test c...,POSITIVE,discussion
27312,zzzprojects/System.Linq.Dynamic.Core,issue,I'm not exactly sure but this answer might hel...,POSITIVE,discussion
27313,zzzprojects/System.Linq.Dynamic.Core,issue,We will wait for the 6.0 release for a complet...,NEGATIVE,discussion


In [None]:
# Inspect text and predicted purpose for the rows at positions 1000-1499
data.loc[:, ['text', 'purpose']].iloc[1000:1500]

Unnamed: 0,text,purpose
1000,"@tillea Hi Andreas, thanks for all the info.\r...",discussion
1001,"Well, that's the way I know. Let me know if yo...",conclusion
1002,This sounds like an issue with (named) imports...,question
1003,> If in the future becomes useful rather than ...,discussion
1004,I have got the same error when trying to use i...,question
...,...,...
1495,"Thanks for reporting the bug, unfortunately it...",bug report
1496,Confirmed. I have the same problem.,conclusion
1497,"What do you mean by ""after rendering"" ? Even a...",question
1498,Just came across this while looking through is...,question


In [None]:
# Persist the labelled frame; the per-category sample cap is part of the file name.
# NOTE(review): unlike save_to_csv, this call does not pass index=False —
# confirm the index column is wanted in the final file.
final_dataset_path = f'final_dataset_samplesize_{SAMPLES_PER_REPO_PER_CATEGORY}.csv'
data.to_csv(final_dataset_path)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Corpus for the n-gram counts: the 'text' column of `data`, which must
# already exist in the kernel from the earlier cells
text_data = data['text']

# Frequency ranking of n-grams over a corpus
def get_top_ngrams(corpus, n=None, ngrams=1):
    """Return the most frequent n-grams of a fixed length in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over; it is traversed once to fit the vocabulary
        and once more to count.
    n : int, optional
        Number of (term, count) pairs to return; ``None`` returns every term
        in the fitted vocabulary.
    ngrams : int, optional
        Length of the n-grams (1 = unigrams, 2 = bigrams, ...).

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count.
    """
    vectorizer = CountVectorizer(
        stop_words='english',
        ngram_range=(ngrams, ngrams),
        max_features=2000,
    )
    vectorizer.fit(corpus)

    # Column sums of the document-term matrix give each term's total count
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Rank the `top_n` most frequent unigrams, bigrams, and trigrams
top_n = 20
top_unigrams, top_bigrams, top_trigrams = (
    get_top_ngrams(text_data, n=top_n, ngrams=size) for size in (1, 2, 3)
)

# Print one titled section per n-gram length; sections after the first are
# preceded by a blank line
ngram_reports = [
    ("Unigrams", top_unigrams),
    ("Bigrams", top_bigrams),
    ("Trigrams", top_trigrams),
]
for position, (label, pairs) in enumerate(ngram_reports):
    separator = "" if position == 0 else "\n"
    print(f"{separator}Top {top_n} {label}:")
    for word, freq in pairs:
        print(f"{word}: {freq}")


Top 20 Unigrams:
issue: 2099
use: 1773
like: 1483
just: 1391
think: 1286
don: 1197
need: 1195
work: 1092
using: 1088
code: 1063
https: 1008
pr: 955
case: 931
error: 927
com: 891
thanks: 869
new: 857
test: 856
make: 849
time: 820

Top 20 Bigrams:
github com: 642
https github: 575
don think: 177
looks like: 168
use case: 155
make sure: 152
don know: 126
let know: 118
makes sense: 104
pull request: 98
com rails: 94
rails rails: 94
don want: 88
make sense: 87
feel free: 68
doesn work: 68
test case: 67
breaking change: 64
rails pull: 63
com facebook: 58

Top 20 Trigrams:
https github com: 575
github com rails: 94
com rails rails: 92
rails rails pull: 62
github com facebook: 58
com facebook react: 57
github com hotwired: 49
github com vuejs: 48
com hotwired turbo: 47
github com ethereum: 41
com ethereum ethereum: 35
copyright 2010 pallets: 35
https codepen io: 32
pr https github: 31
github com twbs: 31
com twbs bootstrap: 30
https gist github: 29
gist github com: 29
hotwired turbo pull: 29
c