In [1]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/Plots'

Mounted at /content/drive
/content/drive/My Drive/Plots


#Import basic libraries

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.colors as mcolors
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder



#Read the datasets

In [4]:

# Specify the file path to your CSV file
obrien = '/content/drive/My Drive/Plots/Obrien2022.csv'
vidoni = '/content/drive/My Drive/Plots/Vidoni2021.csv'
liu = '/content/drive/My Drive/Plots/LiuOnly2020.csv'

# Read the CSV file into a DataFrame
obrien = pd.read_csv(obrien, low_memory=False)
vidoni = pd.read_csv(vidoni, low_memory=False)
liu = pd.read_csv(liu, low_memory=False)
# Now dfq contains the data from your CSV file


#Create a simple Multinomial Naive Bayes

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Load the dataset
data = vidoni

# Split the dataset into features (X) and target labels (y)
X = data['Comments']
y = data['TDType']

# Text preprocessing and feature extraction using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [7]:
# Initialize and train the classifier (e.g., Naive Bayes)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

    ALGORITHM       0.00      0.00      0.00        47
 ARCHITECTURE       1.00      0.07      0.13        57
        BUILD       0.00      0.00      0.00        29
         CODE       0.49      1.00      0.65       428
       DEFECT       0.83      0.07      0.13       138
       DESIGN       1.00      0.02      0.05        43
DOCUMENTATION       0.00      0.00      0.00         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       1.00      0.05      0.09        62
         TEST       0.98      0.63      0.77       151
    USABILITY       0.00      0.00      0.00        19
   VERSIONING       0.00      0.00      0.00         4

     accuracy                           0.54       993
    macro avg       0.44      0.15      0.15       993
 weighted avg       0.64      0.54      0.43       993



#Grid search CV

In [8]:

from sklearn.model_selection import GridSearchCV


# Preprocessing: Convert text to lowercase and remove punctuation
data['Comments'] = data['Comments'].str.lower()
data['Comments'] = data['Comments'].str.replace('[^\w\s]', '')

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['Comments'], data['TDType'], test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the classifier (Multinomial Naive Bayes)
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Evaluate the classifier
y_pred = clf.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Perform hyperparameter tuning using GridSearchCV
parameters = {'alpha': [0.01, 0.1, 1.0, 10.0]}
grid_search = GridSearchCV(clf, parameters, cv=5)
grid_search.fit(X_train_tfidf, y_train)

print("Best parameters found:", grid_search.best_params_)

# Re-train the classifier with the best parameters
best_clf = grid_search.best_estimator_
best_clf.fit(X_train_tfidf, y_train)

# Evaluate the classifier with the best parameters
y_pred_best = best_clf.predict(X_test_tfidf)
print("Accuracy with best parameters:", accuracy_score(y_test, y_pred_best))
print("Classification Report with best parameters:")
print(classification_report(y_test, y_pred_best))


Accuracy: 0.5246727089627392
Classification Report:
               precision    recall  f1-score   support

    ALGORITHM       0.00      0.00      0.00        47
 ARCHITECTURE       1.00      0.07      0.13        57
        BUILD       0.00      0.00      0.00        29
         CODE       0.48      1.00      0.64       428
       DEFECT       0.86      0.04      0.08       138
       DESIGN       0.00      0.00      0.00        43
DOCUMENTATION       0.00      0.00      0.00         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       1.00      0.03      0.06        62
         TEST       0.99      0.55      0.71       151
    USABILITY       0.00      0.00      0.00        19
   VERSIONING       0.00      0.00      0.00         4

     accuracy                           0.52       993
    macro avg       0.36      0.14      0.14       993
 weighted avg       0.59      0.52      0.41       993

Best parameters found: {'alpha': 0.1}
Accuracy with best paramete

#KNN Classiifer

In [9]:
from sklearn.neighbors import KNeighborsClassifier


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Train the classifier
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the classifier
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))



               precision    recall  f1-score   support

    ALGORITHM       0.29      0.04      0.07        47
 ARCHITECTURE       1.00      0.12      0.22        57
        BUILD       1.00      0.21      0.34        29
         CODE       0.48      0.98      0.65       428
       DEFECT       0.73      0.06      0.11       138
       DESIGN       0.67      0.05      0.09        43
DOCUMENTATION       1.00      0.14      0.25         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       1.00      0.15      0.25        62
         TEST       0.97      0.49      0.65       151
    USABILITY       1.00      0.05      0.10        19
   VERSIONING       0.67      0.50      0.57         4

     accuracy                           0.54       993
    macro avg       0.73      0.23      0.28       993
 weighted avg       0.68      0.54      0.44       993

Accuracy: 0.5357502517623364


## TF-IDF Implementation

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Load the dataset
data = vidoni

# Define a function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Perform stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Apply text preprocessing to the Comments column
data['Comments'] = data['Comments'].apply(preprocess_text)

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer and transform the Comments column into TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Comments'])

# Get the feature names (words) from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a DataFrame to store TF-IDF scores for each word
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Add the TDType column to the DataFrame
tfidf_df['TDType'] = data['TDType']

# Group by TDType and calculate the mean TF-IDF score for each word
tfidf_scores_by_class = tfidf_df.groupby('TDType').mean()

# Get the top words for each class based on their TF-IDF scores
top_words_by_class = {}
for td_type, scores in tfidf_scores_by_class.iterrows():
    top_words_by_class[td_type] = scores.nlargest(10).index.tolist()

# Print the top words for each class
for td_type, words in top_words_by_class.items():
    print(f"Top words for class '{td_type}': {words}")


Top words for class 'ALGORITHM': ['fixm', 'todo', 'need', 'decid', 'add', 'deal', 'model', 'use', 'assum', 'data']
Top words for class 'ARCHITECTURE': ['slow', 'fixm', 'todo', 'memori', 'faster', 'speed', 'code', 'use', 'effici', 'make']
Top words for class 'BUILD': ['cmd', 'check', 'cran', 'avoid', 'note', 'packag', 'trick', 'global', 'workaround', 'hack']
Top words for class 'CODE': ['todo', 'fixm', 'need', 'use', 'check', 'hack', 'anymor', 'case', 'remov', 'work']
Top words for class 'DEFECT': ['fixm', 'work', 'fix', 'bug', 'doesnt', 'error', 'todo', 'fail', 'use', 'need']
Top words for class 'DESIGN': ['export', 'todo', 'function', 'object', 'class', 'fixm', 'need', 'use', 'list', 'gener']
Top words for class 'DOCUMENTATION': ['document', 'fixm', 'todo', 'function', 'add', 'undocu', 'develop', 'packag', 'roxygen', 'namespac']
Top words for class 'PEOPLE': ['fixmekelley', 'tom', 'todo', 'michel', 'michael', 'maxlen', 'temp', 'zero', 'stupid', 'clark']
Top words for class 'REQUIREMEN

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Train the classifier
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the classifier
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

               precision    recall  f1-score   support

    ALGORITHM       0.29      0.04      0.07        47
 ARCHITECTURE       1.00      0.12      0.22        57
        BUILD       1.00      0.21      0.34        29
         CODE       0.48      0.98      0.65       428
       DEFECT       0.73      0.06      0.11       138
       DESIGN       0.67      0.05      0.09        43
DOCUMENTATION       1.00      0.14      0.25         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       1.00      0.15      0.25        62
         TEST       0.97      0.49      0.65       151
    USABILITY       1.00      0.05      0.10        19
   VERSIONING       0.67      0.50      0.57         4

     accuracy                           0.54       993
    macro avg       0.73      0.23      0.28       993
 weighted avg       0.68      0.54      0.44       993

Accuracy: 0.5357502517623364


#FASTTEXT Implementation
##Partial implementation

In [14]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m41.0/68.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199774 sha256=a9ab48bdf15481c8ed43b64ea09158b17115cb9f4f17383d7e864bc89d3f82bc
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fa

In [15]:
import fasttext

# Skipgram model :
model = fasttext.train_unsupervised('train.txt', model='skipgram')

# or, cbow model :
model = fasttext.train_unsupervised('train.txt', model='cbow')


In [16]:
print(model.words)

['</s>', 'todo', 'fixm', 'use', 'need', 'nocov', 'check', 'work', 'data', 'test', 'function', 'case', 'add', 'set', 'error', 'file', 'code', 'fail', 'call', 'start', 'hack', 'make', 'remov', 'end', 'valu', 'fix', 'model', 'get', 'name', 'object', 'method', 'variabl', 'r', 'want', 'note', 'one', 'way', 'x', 'doesnt', 'time', 'handl', 'return', 'paramet', 'may', 'well', 'argument', 'column', 'better', 'dont', 'base', 'chang', 'matrix', 'could', 'line', 'first', 'also', 'list', 'avoid', 'type', 'local', 'workaround', 'na', 'warn', 'see', 'number', 'vector', 'group', 'differ', 'creat', 'like', 'new', 'user', 'sure', 'bug', 'issu', 'support', 'run', 'seem', 'trick', 'null', 'would', 'order', 'class', 'eg', 'might', 'instead', 'hope', 'allow', 'gener', 'packag', 'factor', 'encod', 'default', 'everi', 'version', 'someon', 'option', 'possibl', 'sampl', 'result', 'problem', 'utf', 'pass', 'probabl', 'select', 'think', 'true', 'miss', 'element', 'current', 'implement', 'luck', 'tri', 'done', 'cm

In [17]:
print(model['Algorithm'])

[-0.01331932 -0.02987584  0.08746994 -0.2305933   0.12873021  0.0119351
  0.04374043  0.0808204  -0.10769865 -0.00150669 -0.10667936  0.04789455
 -0.01073829  0.02297004 -0.04461262  0.08301235  0.00070967  0.09954163
  0.06798831 -0.06816439 -0.1051579   0.02863431 -0.11691461  0.19580486
  0.01560298 -0.01352675  0.1721037   0.11761338 -0.05479052  0.20277275
 -0.25742415  0.05998928 -0.04337452 -0.0603276  -0.10265287 -0.04107243
 -0.20485827 -0.17517297 -0.12700152  0.07149298  0.01205838 -0.0109686
  0.14568593  0.0186472  -0.13139684 -0.00788239 -0.05090308 -0.05570566
  0.0876175   0.04893132 -0.05922992 -0.10838698 -0.15202484  0.05487808
 -0.04670742 -0.16720279  0.00674873  0.22356516 -0.1562516   0.12374688
 -0.07978079  0.14279597  0.17816491 -0.0319544  -0.07544978 -0.02270277
  0.09238763  0.0376428   0.15416342 -0.19170085 -0.23447059 -0.19564824
  0.00139366 -0.09831797  0.10969438  0.00761874 -0.19707464  0.25467262
  0.07857753  0.10958096 -0.1598969  -0.13349546 -0.0

In [None]:
import fasttext

df = vidoni

# Preprocess the Data
# 'Comments' column contains the text data and 'TDType' column contains the labels
preprocessed_data = df.apply(lambda row: f'__label__{row["TDType"]} {row["Comments"]}', axis=1)

# Step: Save the Preprocessed Data
preprocessed_data.to_csv('train.txt', index=False, header=False, sep=' ')

# Step: Train the FastText Model
model = fasttext.train_supervised(input='train.txt', epoch=25, lr=1.0, wordNgrams=2)

# Step 5: Evaluate the Model
# Example:
# result = model.test("test.txt")
# print(result.precision)
# print(result.recall)

# Step 6: Use the Trained Model for Prediction
texts = ["Sample text 1", "Sample text 2", ...]  # List of text samples for prediction
labels = model.predict(texts)
print(labels)

#Fasttext
###Partial

In [24]:
vidoni

Unnamed: 0,Comments,TDType
0,assum curl handl,CODE
1,cacheabl,CODE
2,impact anycpubl privat controlflag,CODE
3,requir valid,CODE
4,todo might need put param,CODE
...,...,...
4956,todo remov function mvdl,CODE
4957,todo remov function mvdl,CODE
4958,obviou contradict,TEST
4959,mip sensit larg differ valu mvdlejn,TEST


In [22]:
import fasttext
from sklearn.model_selection import train_test_split

# Load dataset
data = vidoni

# Preprocess your data as needed
# For example, remove special characters, tokenize, lowercase, and remove stopwords

# Split your dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Comments'], data['TDType'], test_size=0.2, random_state=42)

# Convert your data to fastText format and save to files
train_file = 'train.txt'
test_file = 'test.txt'

with open(train_file, 'w') as f:
    for text, label in zip(X_train, y_train):
        f.write(f'__label__{label} {text}\n')

with open(test_file, 'w') as f:
    for text, label in zip(X_test, y_test):
        f.write(f'__label__{label} {text}\n')

# Train fastText model
model = fasttext.train_supervised(input=train_file, epoch=25, lr=0.1, wordNgrams=2, dim=100, loss='softmax')

# Evaluate model
result = model.test(test_file)
print(f"Precision: {result[1]:.2f}")
print(f"Recall: {result[2]:.2f}")
print(f"F1 Score: {result[1] * result[2] * 2 / (result[1] + result[2]):.2f}")


Precision: 0.63
Recall: 0.63
F1 Score: 0.63


#Binary Classification Using Random Forest

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report

# Load dataset
data = vidoni

# Preprocessing
X = data['Comments']
y_multi = data['TDType']
y_binary = (data['TDType'] == 'ALGORITHM').astype(int)

# Split data
X_train, X_test, y_multi_train, y_multi_test = train_test_split(X, y_multi, test_size=0.2, random_state=42)
_, _, y_binary_train, y_binary_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Feature extraction
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Multi-label classification
mlb = MultiLabelBinarizer()
y_multi_train_encoded = mlb.fit_transform(y_multi_train.str.split(', '))
y_multi_test_encoded = mlb.transform(y_multi_test.str.split(', '))

multi_clf = MultiOutputClassifier(RandomForestClassifier())
multi_clf.fit(X_train_tfidf, y_multi_train_encoded)
multi_pred = multi_clf.predict(X_test_tfidf)

# Binary classification for ALGORITHM detection
le = LabelEncoder()
y_binary_train_encoded = le.fit_transform(y_binary_train)
y_binary_test_encoded = le.transform(y_binary_test)

binary_clf = RandomForestClassifier()
binary_clf.fit(X_train_tfidf, y_binary_train_encoded)
binary_pred = binary_clf.predict(X_test_tfidf)

# Evaluation
print("Multi-label classification report:\n", classification_report(y_multi_test_encoded, multi_pred))
print("Binary classification report for ALGORITHM detection:\n", classification_report(y_binary_test_encoded, binary_pred))


Multi-label classification report:
               precision    recall  f1-score   support

           0       0.67      0.13      0.21        47
           1       0.83      0.42      0.56        57
           2       1.00      0.31      0.47        29
           3       0.71      0.72      0.72       428
           4       0.76      0.30      0.43       138
           5       0.89      0.19      0.31        43
           6       1.00      0.29      0.44         7
           7       0.00      0.00      0.00         8
           8       0.93      0.23      0.36        62
           9       0.98      0.81      0.89       151
          10       1.00      0.11      0.19        19
          11       0.67      0.50      0.57         4

   micro avg       0.78      0.54      0.64       993
   macro avg       0.79      0.33      0.43       993
weighted avg       0.79      0.54      0.60       993
 samples avg       0.54      0.54      0.54       993

Binary classification report for ALGORITHM 

#TF-IDF implementation --Random Forest

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load dataset
data = vidoni

# Preprocessing
X = data['Comments']
y = data['TDType']

# Label encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
label_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))

# Feature extraction
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

# Model training
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Prediction
y_pred = clf.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Decode predictions and ground truth labels
y_pred_names = [label_mapping[label] for label in y_pred]
y_test_names = [label_mapping[label] for label in y_test]

# Print classification report with TDType names
print("Classification Report:")
print(classification_report(y_test_names, y_pred_names))


Accuracy: 0.6636455186304129
Classification Report:
               precision    recall  f1-score   support

    ALGORITHM       0.70      0.15      0.25        47
 ARCHITECTURE       0.87      0.35      0.50        57
        BUILD       1.00      0.34      0.51        29
         CODE       0.60      0.94      0.73       428
       DEFECT       0.58      0.39      0.47       138
       DESIGN       0.83      0.23      0.36        43
DOCUMENTATION       0.57      0.57      0.57         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       0.77      0.27      0.40        62
         TEST       0.96      0.85      0.90       151
    USABILITY       1.00      0.16      0.27        19
   VERSIONING       0.67      0.50      0.57         4

     accuracy                           0.66       993
    macro avg       0.71      0.40      0.46       993
 weighted avg       0.71      0.66      0.63       993



#TF-IDF--with Binary classifier

In [28]:

# Load dataset
data = vidoni

# Preprocessing
X = data['Comments']
y = data['TDType']

# Label encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
label_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))

# Feature extraction
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

# Binary classification for "ALGORITHM"
y_train_algorithm = (y_train == label_encoder.transform(['ALGORITHM'])[0])
y_test_algorithm = (y_test == label_encoder.transform(['ALGORITHM'])[0])

# Model training for binary classification
clf_algorithm = RandomForestClassifier()
clf_algorithm.fit(X_train, y_train_algorithm)

# Prediction for binary classification
y_pred_algorithm = clf_algorithm.predict(X_test)

# Evaluation for binary classification
accuracy_algorithm = accuracy_score(y_test_algorithm, y_pred_algorithm)
print("Binary Classification (ALGORITHM) Accuracy:", accuracy_algorithm)
print(classification_report(y_test_algorithm, y_pred_algorithm))

# Model training for multi-label classification
clf_multilabel = RandomForestClassifier()
clf_multilabel.fit(X_train, y_train)

# Prediction for multi-label classification
y_pred_multilabel = clf_multilabel.predict(X_test)

# Evaluation for multi-label classification
accuracy_multilabel = accuracy_score(y_test, y_pred_multilabel)
print("Multi-label Classification Accuracy:", accuracy_multilabel)
print(classification_report(y_test, y_pred_multilabel))


Binary Classification (ALGORITHM) Accuracy: 0.9556898288016112
              precision    recall  f1-score   support

       False       0.96      1.00      0.98       946
        True       0.67      0.13      0.21        47

    accuracy                           0.96       993
   macro avg       0.81      0.56      0.60       993
weighted avg       0.94      0.96      0.94       993

Multi-label Classification Accuracy: 0.6626384692849949
              precision    recall  f1-score   support

           0       0.78      0.15      0.25        47
           1       0.96      0.39      0.55        57
           2       0.91      0.34      0.50        29
           3       0.60      0.94      0.73       428
           4       0.55      0.38      0.45       138
           5       0.90      0.21      0.34        43
           6       0.50      0.57      0.53         7
           7       0.00      0.00      0.00         8
           8       0.72      0.29      0.41        62
           9 

## NTLK


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
import spacy

# Download NLTK resources
nltk.download('punkt')

# Load English language model for spaCy
nlp = spacy.load("en_core_web_sm")

# Step 1: Load the dataset
data = vidoni

# Step 2: Data Preprocessing
# You may need to perform additional preprocessing steps here based on your data characteristics

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Comments'], data['TDType'], test_size=0.2, random_state=42)

# Step 4: Feature Extraction (TF-IDF Vectorization)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')  # You can adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Model Selection and Training
model = MultinomialNB()  # You can choose a different classifier based on your requirements
model.fit(X_train_tfidf, y_train)

# Step 6: Model Evaluation
y_pred = model.predict(X_test_tfidf)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Named Entity Recognition with spaCy
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Example usage
text = "The company Apple Inc. is based in California."
print("Named Entities:")
print(extract_entities(text))

# Markov model with NLTK
class MarkovModel:
    def __init__(self):
        self.model = None

    def train(self, text):
        tokens = word_tokenize(text)
        bigrams = list(nltk.bigrams(tokens))
        self.model = nltk.ConditionalFreqDist(bigrams)

    def generate_text(self, seed, num_words=10):
        if self.model is None:
            raise ValueError("Model not trained")
        word = seed
        text = [word]
        for _ in range(num_words):
            if word not in self.model:
                break
            word = self.model[word].max()
            text.append(word)
        return ' '.join(text)

# Example usage
text = "This is a sample text for training the Markov model."
markov_model = MarkovModel()
markov_model.train(text)
generated_text = markov_model.generate_text("This")
print("Generated Text:")
print(generated_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Classification Report:
               precision    recall  f1-score   support

    ALGORITHM       0.00      0.00      0.00        47
 ARCHITECTURE       1.00      0.05      0.10        57
        BUILD       0.00      0.00      0.00        29
         CODE       0.49      1.00      0.65       428
       DEFECT       0.82      0.07      0.12       138
       DESIGN       1.00      0.02      0.05        43
DOCUMENTATION       0.00      0.00      0.00         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       1.00      0.05      0.09        62
         TEST       0.97      0.64      0.77       151
    USABILITY       0.00      0.00      0.00        19
   VERSIONING       0.00      0.00      0.00         4

     accuracy                           0.54       993
    macro avg       0.44      0.15      0.15       993
 weighted avg       0.63      0.54      0.43       993

Confusion Matrix:
[[  0   0   0  47   0   0   0   0   0   0   0   0]
 [  0   3   0  54   0   0

#feature representation using Word2Vec

#Markov Model

In [32]:
pip install markovify

Collecting markovify
  Downloading markovify-0.9.4.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unidecode (from markovify)
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: markovify
  Building wheel for markovify (setup.py) ... [?25l[?25hdone
  Created wheel for markovify: filename=markovify-0.9.4-py3-none-any.whl size=18606 sha256=eb5c85967fae55646ce4665fa5cd967f442cd26b52474d8087051b5b1c376c54
  Stored in directory: /root/.cache/pip/wheels/ca/8c/c5/41413e24c484f883a100c63ca7b3b0362b7c6f6eb6d7c9cc7f
Successfully built markovify
Installing collected packages: unidecode, markovify
Successfully installed markovify-0.9.4 unidecode-1.3.8


#Named Entity RECOG

In [34]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [35]:
import pandas as pd

# Specify the file path to your CSV file
obrien = '/content/drive/My Drive/Plots/Obrien2022.csv'
vidoni = '/content/drive/My Drive/Plots/Vidoni2021.csv'
liu = '/content/drive/My Drive/Plots/LiuOnly2020.csv'

# Read the CSV file into a DataFrame
obrien = pd.read_csv(obrien, low_memory=False)
vidoni = pd.read_csv(vidoni, low_memory=False)
liu = pd.read_csv(liu, low_memory=False)
# Now dfq contains the data from your CSV file


In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from nltk import ne_chunk, word_tokenize, pos_tag
from nltk.tree import Tree

# Load the dataset
data = vidoni

# Define a function to extract named entities using NLTK's NER
def extract_named_entities(text):
    entities = []
    for chunk in ne_chunk(pos_tag(word_tokenize(text))):
        if isinstance(chunk, Tree):
            entities.append(" ".join([token for token, pos in chunk.leaves()]))
    return " ".join(entities)

# Apply the function to extract named entities from the Comments column
data['NamedEntities'] = data['Comments'].apply(extract_named_entities)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['NamedEntities'], data['TDType'], test_size=0.2, random_state=42)

# Convert named entities into features using CountVectorizer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Train a Support Vector Classifier (SVC) model
model = SVC()
model.fit(X_train_counts, y_train)

# Evaluate the model
y_pred = model.predict(X_test_counts)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

    ALGORITHM       0.50      0.06      0.11        47
 ARCHITECTURE       1.00      0.05      0.10        57
        BUILD       0.57      0.14      0.22        29
         CODE       0.45      0.97      0.61       428
       DEFECT       0.47      0.07      0.11       138
       DESIGN       0.67      0.14      0.23        43
DOCUMENTATION       0.00      0.00      0.00         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       0.27      0.05      0.08        62
         TEST       0.50      0.01      0.03       151
    USABILITY       0.00      0.00      0.00        19
   VERSIONING       0.00      0.00      0.00         4

     accuracy                           0.45       993
    macro avg       0.37      0.12      0.13       993
 weighted avg       0.48      0.45      0.32       993



In [40]:

from sklearn.svm import SVC
import spacy

# Load the dataset
data = vidoni

# Preprocess the text data
data['Comments'] = data['Comments'].str.lower()

# Load the English language model for NER using spaCy
nlp = spacy.load('en_core_web_sm')

# Function to extract named entities from text using spaCy
def extract_named_entities(text):
    doc = nlp(text)
    named_entities = [ent.text for ent in doc.ents]
    return ' '.join(named_entities)

# Apply NER to extract named entities from the comments
data['NamedEntities'] = data['Comments'].apply(extract_named_entities)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['NamedEntities'], data['TDType'], test_size=0.2, random_state=42)

# Initialize a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data to TF-IDF vectors
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data to TF-IDF vectors
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize an SVM classifier
svm_classifier = SVC(kernel='linear')

# Train the SVM classifier on the TF-IDF vectors
svm_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the testing data
predictions = svm_classifier.predict(X_test_tfidf)

# Print the classification report
print(classification_report(y_test, predictions))


               precision    recall  f1-score   support

    ALGORITHM       1.00      0.02      0.04        47
 ARCHITECTURE       0.00      0.00      0.00        57
        BUILD       0.00      0.00      0.00        29
         CODE       0.44      1.00      0.61       428
       DEFECT       0.64      0.05      0.09       138
       DESIGN       0.00      0.00      0.00        43
DOCUMENTATION       0.00      0.00      0.00         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       0.00      0.00      0.00        62
         TEST       0.29      0.01      0.03       151
    USABILITY       0.00      0.00      0.00        19
   VERSIONING       1.00      0.25      0.40         4

     accuracy                           0.44       993
    macro avg       0.28      0.11      0.10       993
 weighted avg       0.37      0.44      0.28       993



In [None]:
data

#unigram and bigram

In [42]:

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

# Load the dataset
df = vidoni

# Feature Representation
# Treat each comment as a document and create a feature representation
comments = df['Comments']
td_types = df['TDType']

# Generate an alphabetical list of unigrams and count occurrence of each token
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(comments)

# Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, td_types, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Evaluate Model Performance
# Calculate confusion matrix for resubstitution error
y_pred_train = classifier.predict(X_train)
conf_matrix_train = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix (Training):\n", conf_matrix_train)

# Evaluate the model on validation set
y_pred_val = classifier.predict(X_val)
print("Classification Report (Validation):\n", classification_report(y_val, y_pred_val))

# Evaluate the model on test set
y_pred_test = classifier.predict(X_test)
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test))


Confusion Matrix (Training):
 [[  67    1    0  135    3    0    0    0    2    1    0    0]
 [   0  107    0   90    5    0    0    0    0    1    0    0]
 [   0    0   29   70    5    0    0    0    1    4    0    0]
 [   0    1    0 1372   11    0    0    0    2    2    0    0]
 [   0    0    0  128  351    0    0    0    1    3    0    0]
 [   0    0    0  111    6   34    0    0    2    1    0    0]
 [   0    0    0   29    7    0    5    0    0    1    0    0]
 [   0    0    0   10    2    0    0    0    0    0    0    0]
 [   0    0    0  133    6    0    0    0  116    1    0    0]
 [   0    1    0   85   12    0    0    0    0  453    0    0]
 [   1    0    0   41    1    0    0    0    0    2    4    0]
 [   0    0    0   14    1    0    0    0    0    0    0    1]]
Classification Report (Validation):
                precision    recall  f1-score   support

    ALGORITHM       0.40      0.06      0.10        36
 ARCHITECTURE       0.60      0.17      0.26        36
        BU

#Word2VEc embeddings

In [43]:
# Load the dataset
df = vidoni

# Feature Representation
# Treat each comment as a document and create a feature representation
comments = df['Comments']
td_types = df['TDType']

# Generate an alphabetical list of unigrams and count occurrence of each token
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(comments)

# Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, td_types, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Evaluate Model Performance
# Calculate confusion matrix for resubstitution error
y_pred_train = classifier.predict(X_train)
conf_matrix_train = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix (Training):\n", conf_matrix_train)

# Evaluate the model on validation set
y_pred_val = classifier.predict(X_val)
print("Classification Report (Validation):\n", classification_report(y_val, y_pred_val))

# Evaluate the model on test set
y_pred_test = classifier.predict(X_test)
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test))


Confusion Matrix (Training):
 [[  67    1    0  135    3    0    0    0    2    1    0    0]
 [   0  107    0   90    5    0    0    0    0    1    0    0]
 [   0    0   29   70    5    0    0    0    1    4    0    0]
 [   0    1    0 1372   11    0    0    0    2    2    0    0]
 [   0    0    0  128  351    0    0    0    1    3    0    0]
 [   0    0    0  111    6   34    0    0    2    1    0    0]
 [   0    0    0   29    7    0    5    0    0    1    0    0]
 [   0    0    0   10    2    0    0    0    0    0    0    0]
 [   0    0    0  133    6    0    0    0  116    1    0    0]
 [   0    1    0   85   12    0    0    0    0  453    0    0]
 [   1    0    0   41    1    0    0    0    0    2    4    0]
 [   0    0    0   14    1    0    0    0    0    0    0    1]]
Classification Report (Validation):
                precision    recall  f1-score   support

    ALGORITHM       0.40      0.06      0.10        36
 ARCHITECTURE       0.60      0.17      0.26        36
        BU

#WORD2VEC MODEL

In [44]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Load your dataset
df = vidoni

# Tokenize the comments
tokenized_comments = [word_tokenize(comment.lower()) for comment in df['Comments']]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_comments, vector_size=100, window=5, min_count=1, workers=4)

# Save the trained model
word2vec_model.save('word2vec_model.bin')

In [45]:
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression


# Load your dataset with labeled data (Comments and TDType)
df = vidoni

# Tokenize the comments
tokenized_comments = [word_tokenize(comment.lower()) for comment in df['Comments']]

# Load the pre-trained Word2Vec model
word2vec_model = Word2Vec.load('word2vec_model.bin')

# Generate word embeddings for each comment
comment_embeddings = []
for tokens in tokenized_comments:
    embeddings = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if embeddings:
        comment_embeddings.append(sum(embeddings) / len(embeddings))
    else:
        # If no word in the comment is present in the Word2Vec vocabulary, use a zero vector
        comment_embeddings.append([0] * word2vec_model.vector_size)

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(comment_embeddings, df['TDType'], test_size=0.2, random_state=42)

# Train a classifier (e.g., Logistic Regression)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)


               precision    recall  f1-score   support

    ALGORITHM       0.00      0.00      0.00        47
 ARCHITECTURE       0.00      0.00      0.00        57
        BUILD       0.00      0.00      0.00        29
         CODE       0.47      1.00      0.64       428
       DEFECT       0.50      0.03      0.05       138
       DESIGN       0.00      0.00      0.00        43
DOCUMENTATION       0.00      0.00      0.00         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       0.00      0.00      0.00        62
         TEST       0.96      0.48      0.64       151
    USABILITY       0.00      0.00      0.00        19
   VERSIONING       0.00      0.00      0.00         4

     accuracy                           0.51       993
    macro avg       0.16      0.13      0.11       993
 weighted avg       0.42      0.51      0.38       993



#Binary Classification

In [46]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = vidoni

# Preprocess the data
df['Comments'] = df['Comments'].apply(lambda x: x.lower())  # Convert to lowercase
df['Comments'] = df['Comments'].apply(lambda x: ' '.join(word_tokenize(x)))  # Tokenize

# Define the target variable
df['IsAlgorithm'] = (df['TDType'] == 'ALGORITHM').astype(int)

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Comments'], df['IsAlgorithm'], test_size=0.2, random_state=42)

# Load the pre-trained Word2Vec model
word2vec_model = Word2Vec.load('word2vec_model.bin')

# Generate word embeddings for each comment
def generate_embeddings(comment):
    tokens = word_tokenize(comment)
    embeddings = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if embeddings:
        return sum(embeddings) / len(embeddings)
    else:
        return [0] * word2vec_model.vector_size

X_train_embeddings = X_train.apply(generate_embeddings)
X_test_embeddings = X_test.apply(generate_embeddings)

# Train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(list(X_train_embeddings), y_train)

# Make predictions on the test set
y_pred = classifier.predict(list(X_test_embeddings))

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

classification_report = classification_report(y_test, y_pred)
print(classification_report)


Accuracy: 0.9526686807653575
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       946
           1       0.00      0.00      0.00        47

    accuracy                           0.95       993
   macro avg       0.48      0.50      0.49       993
weighted avg       0.91      0.95      0.93       993



In [None]:
print(df['IsAlgorithm'].unique())

#Experimenting with with  feature extraction techniques and vectorizers

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load your dataset

df = vidoni

# Preprocess the data
df['Comments'] = df['Comments'].apply(lambda x: x.lower())  # Convert to lowercase
df['Comments'] = df['Comments'].apply(lambda x: ' '.join(word_tokenize(x)))  # Tokenize

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Comments'], df['TDType'], test_size=0.2, random_state=42)

# Define different vectorizers
vectorizers = {
    'CountVectorizer': CountVectorizer(),
    'TfidfVectorizer': TfidfVectorizer(),
    'HashingVectorizer': HashingVectorizer()
}

# Experiment with different vectorizers and feature extraction techniques
for vectorizer_name, vectorizer in vectorizers.items():
    print(f"Experimenting with {vectorizer_name}...")

    # Transform the text data using the vectorizer
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    # Train a simple classifier (Logistic Regression) on the vectorized data
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_vectorized, y_train)

    # Evaluate the classifier on the test set
    y_pred = clf.predict(X_test_vectorized)

    # Print classification report
    print(f"Classification Report for {vectorizer_name}:")
    print(classification_report(y_test, y_pred))
    print("="*50)


Experimenting with CountVectorizer...
Classification Report for CountVectorizer:
               precision    recall  f1-score   support

    ALGORITHM       0.39      0.28      0.33        47
 ARCHITECTURE       0.69      0.39      0.49        57
        BUILD       0.73      0.38      0.50        29
         CODE       0.65      0.87      0.74       428
       DEFECT       0.51      0.46      0.49       138
       DESIGN       0.48      0.23      0.31        43
DOCUMENTATION       0.67      0.29      0.40         7
       PEOPLE       0.00      0.00      0.00         8
 REQUIREMENTS       0.50      0.34      0.40        62
         TEST       0.92      0.81      0.86       151
    USABILITY       0.62      0.26      0.37        19
   VERSIONING       0.50      0.50      0.50         4

     accuracy                           0.65       993
    macro avg       0.55      0.40      0.45       993
 weighted avg       0.64      0.65      0.63       993

Experimenting with TfidfVectorizer..

In [None]:
pip install hmmlearn

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Step 1: Load the dataset
data = vidoni

# Step 2: Preprocessing
# Your preprocessing code here...

# Step 3: Feature Extraction
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data['Comments'])
y = data['TDType']

# Step 4: Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = hmm.GaussianHMM(n_components=10, covariance_type="full", n_iter=100)
model.fit(X_train.toarray())

# Step 5: Evaluation
y_pred = model.predict(X_test.toarray())
print(classification_report(y_test, y_pred))