In [1]:
import csv
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("extended.csv")
df = df.dropna(axis=1, how='any')
df.tail()

Unnamed: 0,Type,Requirement
972,FR,There will be a designated phone number that u...
973,FR,Texts sent to that number will be sent to the ...
974,FR,"If a question is not understood by our API, th..."
975,FR,Upon the USB being plugged in the system shall...
976,FR,The system shall be able to handle 1000 custom...


In [3]:
df["Tag"] = df.apply(lambda x: 1 if x["Type"] in ['F', 'FR'] else 0, axis=1)
df.head()

Unnamed: 0,Type,Requirement,Tag
0,PE,The system shall refresh the display every 60 ...,0
1,LF,The application shall match the color of the s...,0
2,US,If projected the data must be readable. On ...,0
3,A,The product shall be available during normal ...,0
4,US,If projected the data must be understandable...,0


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X = df["Requirement"]
y = df["Tag"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [7]:
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [8]:
transformer = TfidfTransformer()
X_train_tfidf = transformer.fit_transform(X_train_counts)
X_test_tfidf = transformer.transform(X_test_counts)

In [9]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [10]:
models = [
    ('lr', LogisticRegression()),
    ('svm', svm.SVC(kernel="linear")),
    ('dt', tree.DecisionTreeClassifier()),
    ('rf', RandomForestClassifier())
]

In [11]:
ensemble_model = VotingClassifier(models)

In [12]:
ensemble_model.fit(X_train_tfidf, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('svm', SVC(kernel='linear')),
                             ('dt', DecisionTreeClassifier()),
                             ('rf', RandomForestClassifier())])

In [13]:
print(f"Ensemble model score: {ensemble_model.score(X_test_tfidf, y_test)}")

Ensemble model score: 0.8571428571428571
