In [1]:
#import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Load parquet files into DataFrames
df = pd.read_csv('/content/drive/My Drive/ilab dataset/real_final_df.csv')

In [3]:
print(df.columns)

Index(['lens_id', 'date_published', 'kind', 'text', 'claim_text',
       'applicant_name', 'invention_title', 'inventor_name', 'cited_lens_id',
       'cited_jurisdiction', 'cited_kind', 'symbol', 'grant_date', 'granted',
       'patent_status', 'cited_phase', 'nplcit_lens_id', 'nplcit_text',
       'patcit_jurisdiction', 'patcit_kind', 'main_cpc', 'sub_cpc',
       'main_cpc_description', 'sub_cpc_description', 'jurisdiction',
       'truncated_text', 'sentiment', 'categories', 'Time Horizon',
       'Opportunity-Challenge', 'Likelihood', 'Degree of Change',
       'Scale of Impact', 'summary'],
      dtype='object')


In [4]:
# Check for missing values in each column
missing_columns = df.columns[df.isna().any()].tolist()

# Print the columns with missing values
print("Columns with missing values:", missing_columns)

Columns with missing values: ['inventor_name', 'cited_lens_id', 'cited_jurisdiction', 'cited_kind', 'symbol', 'grant_date', 'granted', 'cited_phase', 'nplcit_lens_id', 'nplcit_text', 'patcit_jurisdiction', 'patcit_kind', 'main_cpc', 'sub_cpc', 'main_cpc_description', 'sub_cpc_description', 'summary']


In [5]:
# List of text columns with missing values
text_columns = df.columns[df.dtypes == 'object']  # Select all text columns

# Fill missing values in text columns with a placeholder
for col in text_columns:
    df[col].fillna('missing', inplace=True)

In [6]:
df['granted'] = df['granted'].astype(str)

In [7]:
# List of column names to keep
columns_to_keep = ["lens_id",
                   "date_published",
                   "kind",
                   "grant_date",
                   "text",
                   "claim_text",
                   "applicant_name",
                   "invention_title",
                   "summary",
                   "inventor_name",
                   "cited_lens_id",
                   "cited_jurisdiction",
                   "granted",
                   "main_cpc",
                   "sub_cpc",
                   "sentiment",
                   "categories"]
# Move the specified columns
#"description_text",
new_df = df[columns_to_keep].copy()

In [8]:
# Create a target variable
y = new_df['granted']
X = new_df.drop(columns='granted')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print(len(y_train))
print(len(X_train))

7636
7636


In [12]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train[col] = label_encoder.fit_transform(X_train[col])

In [15]:
for col in X_test.columns:
    if X_test[col].dtype == 'object':
        X_test[col] = label_encoder.fit_transform(X_test[col])

In [13]:
from sklearn.preprocessing import LabelEncoder
# Encode the 'granted' labels into numerical values
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [14]:
# Create and train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred = clf.predict(X_test)

In [17]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.61
