In [1]:
import config
import pandas as pd
import pickle

In [2]:
train = pd.read_pickle(config.TRAIN)
test = pd.read_pickle(config.TEST)
X_train, y_train = train["problem_statement"], train["tags"]
X_test, y_test = test["problem_statement"], test["tags"]

print(X_train.tail())
print(y_train.tail())
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

990C     also integ "+" oper suitabl bracket )", exceed...
1154C    certain start week integ ration stake input ca...
339C     otherwis integ play outweigh assum 1000 two te...
909D     integ appli mani input oper two how point ther...
582E     also remaind integ & )'. oper consid two take ...
Name: problem_statement, dtype: object
990C                                      [implementation]
1154C                               [implementation, math]
339C     [constructivealgorithms, dfsandsimilar, dp, gr...
909D              [datastructures, greedy, implementation]
582E                     [bitmasks, dp, expressionparsing]
Name: tags, dtype: object
(6272,) (6272,)
(1568,) (1568,)


In [3]:
# multilabel binarizer
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train_onehot = pd.DataFrame(mlb.fit_transform(y_train),
    columns=mlb.classes_, index=y_train.index)
y_test_onehot = pd.DataFrame(mlb.fit_transform(y_test),
    columns=mlb.classes_, index=y_test.index)

print(mlb.classes_)
tags = mlb.classes_

print(y_train_onehot.shape)
print(X_train.shape, y_train_onehot["implementation"].shape)


['2-sat' 'binarysearch' 'bitmasks' 'bruteforce' 'chineseremaindertheorem'
 'combinatorics' 'constructivealgorithms' 'datastructures' 'dfsandsimilar'
 'divideandconquer' 'dp' 'dsu' 'expressionparsing' 'fft' 'flows' 'games'
 'geometry' 'graphmatchings' 'graphs' 'greedy' 'hashing' 'implementation'
 'interactive' 'math' 'matrices' 'meet-in-the-middle' 'nan' 'numbertheory'
 'probabilities' 'schedules' 'shortestpaths' 'sortings' 'strings'
 'stringsuffixstructures' 'ternarysearch' 'trees' 'twopointers']
(6272, 37)
(6272,) (6272,)


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features=config.MAX_FEATURES)
X_train_counts = count_vect.fit_transform(X_train)
print(X_train_counts.shape)
X_test_counts = count_vect.transform(X_test)
print(X_test_counts.shape)


(6272, 200)
(1568, 200)


In [5]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

def tags_as_str(y_pred):
    y_ans = []
    for i in range(len(y_pred)):
        if y_pred[i]: 
            y_ans.append(tags[i])
    return y_ans



def predict(ii):
    X_test_ii = X_test_counts[ii]
    y_preds = []
    print(X_test_ii.shape)
    for tag in tags:
        clf.fit(X_train_counts, y_train_onehot[tag])
        y_pred = clf.predict(X_test_ii)
        y_preds.extend(y_pred)
    # print(y_preds)
    # print(list(y_test_onehot.iloc[ii]))
    # print(y_test[ii])
    return tags_as_str(y_preds)

print(predict(1000), y_test.iloc[1000])

(1, 200)
[] ['graphs', 'implementation']


In [6]:
from preprocess import preprocess_text
samples = ["this is a stupid dynamic programming probelm that you cannot solve with graph tricks.",
"can you solve me?"]
X_clean = preprocess_text(samples[0])
print(X_clean)

vectorizer = CountVectorizer(max_features=config.MAX_FEATURES)
import numpy as np
X_clean_vec = vectorizer.fit_transform([X_clean])
print(vectorizer.get_feature_names_out())
np.pad(X_clean_vec,(0,3), mode="constant")
print(X_clean_vec.shape)

stupid dynam graph program solv trick cannot probelm
['cannot' 'dynam' 'graph' 'probelm' 'program' 'solv' 'stupid' 'trick']
(1, 8)


[nltk_data] Downloading package stopwords to /home/jhojin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### TF-IDF

In [7]:
samples

['this is a stupid dynamic programming probelm that you cannot solve with graph tricks.',
 'can you solve me?']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
# corpus = samples
vectorizer = TfidfVectorizer(max_features=10)
X = vectorizer.fit_transform(X_train.tolist())
feat = vectorizer.get_feature_names_out()
# print(X,feat)
from sklearn.linear_model import SGDClassifier
model = OneVsRestClassifier(SGDClassifier())
model.fit(X, y_train_onehot)

samples = ["this is a stupid dynamic programming probelm that you cannot solve with graph tricks.",
"can you solve me?"]
sample = samples[0]
vec_samples = vectorizer.fit_transform(samples)
y_pred = model.predict(vec_samples)
print(y_pred)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0]]


### Pipeline and Prediction

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsOneClassifier,OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier

mlb = MultiLabelBinarizer()
y_train_ml = mlb.fit_transform(y_train)
print(X_train_counts.shape, y_train_ml.shape)

clf = OneVsRestClassifier(MultinomialNB()).fit(X_train_counts, y_train_ml)
print(clf)

# ValueError: X has 8 features, 
# but MultinomialNB is expecting 200 features as input.
# print(X_clean_vec.reshape((X_clean_vec.shape[0],config.MAX_FEATURES)))

XXXX = np.resize(X_clean_vec.toarray(),(X_clean_vec.shape[0],config.MAX_FEATURES))
y_pred = clf.predict(XXXX)
print(XXXX.shape,y_pred.shape)
print(y_pred,tags_as_str(y_pred[0]))

(6272, 200) (6272, 37)
OneVsRestClassifier(estimator=MultinomialNB())
(1, 200) (1, 37)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0]] ['schedules']


In [10]:
pipeline = Pipeline([
    ("count_vec",CountVectorizer(max_features=config.MAX_FEATURES)),
    # ("mlb",MultiLabelBinarizer(sparse_output=True)), #cannot transform y in pipeline
    ("multi_nb",OneVsRestClassifier(MultinomialNB())),
])
pipeline.fit(X_train,y_train_ml)
pickle.dump((pipeline,tags), open(config.MODEL, 'wb'))

In [11]:
from sklearn.metrics import accuracy_score
# X_test_counts = count_vect.transform(X_test)
y_test_onehot = mlb.fit_transform(y_test)
y_pred = pipeline.predict(X_test)
print(y_test_onehot.shape, y_pred.shape)
accuracy_score(y_test_onehot, y_pred)

(1568, 37) (1568, 37)


0.051658163265306124

In [12]:
samples = [
# # 1746A constructive, greedy
#     """You have an array a of size n consisting only of zeroes and ones and an integer k. In one operation you can do one of the following:

# Select 2 consecutive elements of a and replace them with their minimum (that is, let $$$a:=[a1,a2,…,ai−1,min(ai,ai+1),ai+2,…,an]$$$ for some $$$1≤i≤n−1$$$). This operation decreases the size of a by 1.
# Select k consecutive elements of a and replace them with their maximum (that is, let $$$a:=[a1,a2,…,ai−1,max(ai,ai+1,…,ai+k−1),ai+k,…,an]$$$ for some $$$1≤i≤n−k+1$$$). This operation decreases the size of a by k−1.
# Determine if it's possible to turn a into $$$[1]$$$ after several (possibly zero) operations.""", 
# # 1742A implementation
# """You are given three integers a, b, and c. Determine if one of them is the sum of the other two.""" ,
# # same problem with input, output directions
# """You are given three integers a, b, and c. Determine if one of them is the sum of the other two.

# The first line contains a single integer t $$$(1≤t≤9261)$$$ — the number of test cases.

# The description of each test case consists of three integers a, b, c $$$(0≤a,b,c≤20)$$$.

# For each test case, output "YES" if one of the numbers is the sum of the other two, and "NO" otherwise.

# You can output the answer in any case (for example, the strings "yEs", "yes", "Yes" and "YES" will be recognized as a positive answer)."""
"""
Alice and Bob are playing a game. They have an array of positive integers a of size n.

Before starting the game, Alice chooses an integer k≥0. The game lasts for k stages, the stages are numbered from 1 to k. During the i-th stage, Alice must remove an element from the array that is less than or equal to k−i+1. After that, if the array is not empty, Bob must add k−i+1 to an arbitrary element of the array. Note that both Alice's move and Bob's move are two parts of the same stage of the game. If Alice can't delete an element during some stage, she loses. If the k-th stage ends and Alice hasn't lost yet, she wins.

Your task is to determine the maximum value of k such that Alice can win if both players play optimally. Bob plays against Alice, so he tries to make her lose the game, if it's possible.

Input
The first line contains a single integer t (1≤t≤100) — the number of test cases.

The first line of each test case contains a single integer n (1≤n≤100) — the size of the array a.

The second line contains n integers a1,a2,…,an (1≤ai≤n).

Output
For each test case, print one integer — the maximum value of k such that Alice can win if both players play optimally.
"""
] 
for i in range(len(samples)):
    samples[i] = preprocess_text(samples[i])

y_pred = pipeline.predict(samples)
for y in y_pred:
    print(tags_as_str(y))

['games', 'greedy', 'math']
