# DataLab Cup 1
[Kaggle](https://www.kaggle.com/competitions/2023-datalab-cup1-predicting-news-popularity/data)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from pylab import *
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelBinarizer
from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression, Perceptron, SGDClassifier
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.metrics import mean_squared_error as MSE, r2_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings("ignore")

from bs4 import BeautifulSoup as BSoup

nltk.download('stopwords')
stopWords = stopwords.words('english')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
train_R = pd.read_csv('/content/drive/MyDrive/deep learn/comp1/train_new2.csv')
test_R = pd.read_csv('/content/drive/MyDrive/deep learn/comp1/test_new2.csv')

In [None]:
print(train_R.shape)
print(test_R.shape)
print(train_R.columns)
print(test_R.columns)

(27643, 8)
(11847, 7)
Index(['Id', 'Popularity', 'Page content', 'Days', 'Time', 'Title', 'Content',
       'N_word'],
      dtype='object')
Index(['Id', 'Page content', 'Days', 'Time', 'Title', 'Content', 'N_word'], dtype='object')


In [None]:
days_conv = {'void': 0, 'Mon': 1, 'Tue': 2, 'Wed':3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}

train_R['Days'] = train_R['Days'].apply(lambda x: days_conv[x])
test_R['Days'] = test_R['Days'].apply(lambda x: days_conv[x])

In [None]:
print(test_R.iloc[500, 1])

<html><head><div class="article-info"><span class="byline "><a href="/author/lance-ulanoff/"><img alt="2016%2f09%2f16%2fa2%2fhttpsd2mhye01h4nj2n.cloudfront.netmediazgkymde1lza0.a9a3a" class="author_image" src="http://i.amz.mshcdn.com/ziOBHf-ziCge9FhlLPoBk_xbCS8=/90x90/2016%2F09%2F16%2Fa2%2Fhttpsd2mhye01h4nj2n.cloudfront.netmediaZgkyMDE1LzA0.a9a3a.jpg"/></a><span class="author_name">By <a href="/author/lance-ulanoff/">Lance Ulanoff</a></span><time datetime="Wed, 10 Apr 2013 22:29:51 +0000">2013-04-10 22:29:51 UTC</time></span></div></head><body><h1 class="title">Lagoa Puts Pro-Level 3D Image Tools in the Cloud</h1><figure class="article-image"><img class="microcontent" data-fragment="lead-image" data-image="http://i.amz.mshcdn.com/zmFWH78iUvGLn8CsDepdVT-ezK8=/950x534/2013%2F07%2F08%2F18%2FMarofa_15.80912.jpg" data-micro="1" data-url="null" src="http://i.amz.mshcdn.com/zmFWH78iUvGLn8CsDepdVT-ezK8=/950x534/2013%2F07%2F08%2F18%2FMarofa_15.80912.jpg"/></figure><article data-channel="tech"><

In [None]:
soup = BSoup(test_R.iloc[1585, 1], 'html.parser')

# Find all HTML tags and get their attributes without content
tags_with_attributes = [tag for tag in soup.find_all()]

# Print the list of tags with attributes (excluding content)
for tag in tags_with_attributes:
    # Extract the tag name and its attributes
    tag_name = tag.name
    tag_attributes = tag.attrs
    print(f"<{tag_name} {tag_attributes}>")

In [6]:
# html tag V
# emoticons V
# date : 2013-06-19 V
# time: 15:04:30 UTC => to seconds V
# topic: Topics: Asteroid, Asteroids, challenge, Earth, Space, U.S., World
# lengths V
# token lengths
# weekdays + weekends V
# standardScaling
# topics - occurrences

def extract_day_time(text):
    text = BSoup(text, 'html.parser')

    tag = text.find(name='time')
    day = 'void' # published day
    time = 0 # time in seconds

    try:
        attr = tag.attrs['datetime']
    except:
        attr = None
        print("Date Not Found")


    if(attr):
        day = attr[:3]
        if(not day):
            day = 'NoData'

        time_str = re.findall('\d\d[:]\d\d[:]\d\d', attr)[0]
        if (not time_str):
            time = 0
        else:
            hour = int(time_str[0] + time_str[1])
            minute = int(time_str[3] + time_str[4])
            second = int(time_str[6] + time_str[7])
            time = hour*3600 + minute*60 + second
    return day, time

def extract_title_content(text):
    text = BSoup(text, 'html.parser')

    title = text.find('h1', attrs={'class': 'title'}).get_text()
    content = text.find('section', attrs={'class': 'article-content'}).get_text()

    # print(title)
    # print(content)

    return title, content

def preprocess(text):
    text = BSoup(text, 'html.parser').get_text()

    reg = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emo = re.findall(reg, text)
    text = re.sub(reg, '', text)

    text = re.sub('[\W]+', " ", text.lower()) + " " + ' '.join(emo).replace('-', '')
    return text

def tokenizer(text):
    return re.split('\s+', text.strip())

def tokenizer_stem_noStopWords(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip())
            if w not in stopWords and re.match('[a-zA-Z]+', w)]

In [None]:
day_of_week = []
time_in_sec = []
article_title = []
article_content = []

def create_more_feats(data):
    day_of_week_local = []
    time_in_sec_local = []
    text_title = []
    text_content = []

    for i in range(0, len(data['Page content'])):
        text = data['Page content'][i]

        day, time = extract_day_time(text)
        day_of_week_local.append(day)
        time_in_sec_local.append(time)

        title, content = extract_title_content(text)
        text_title.append(title)
        text_content.append(content)
    return day_of_week_local, time_in_sec_local, text_title, text_content

day_of_week, time_in_sec, article_title, article_content = create_more_feats(train_R)

train_R["Days"] = day_of_week
train_R["Time"] = time_in_sec
train_R["Title"] = article_title
train_R["Content"] = article_content

day_of_week, time_in_sec, article_title, article_content = create_more_feats(test_R)

test_R["Days"] = day_of_week
test_R["Time"] = time_in_sec
test_R["Title"] = article_title
test_R["Content"] = article_content

In [None]:
train_R.to_csv("train_new2.csv", index=False)
test_R.to_csv("test_new2.csv", index=False)

- training and validation data splitting

In [None]:
print(train_R.shape, valid_R.shape)
print(test_R.shape)
# print(train_R.head())
# print(test_R.head())
# print(train_R[:5])

(20732, 8) (6911, 8)
(11847, 7)


In [None]:
class CharacterCounter(BaseEstimator, TransformerMixin):
    """Count the number of characters in a document."""
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        n_characters = X.str.len()
        # print('567', (n_characters), '2636',  X.str)
        return n_characters.values.reshape(-1,1) # 2D array

class WordCounter(BaseEstimator, TransformerMixin):
    """Count the number of words in a document."""
    def _init_(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        n_word = X.apply(lambda text: len(text.split()))
        # print((n_word))
        return n_word.values.reshape(-1,1)

In [None]:
word_count = WordCounter()
word_count.fit(train_R['Content'])

# train_R['N_word'] = word_count.transform(train_R['Content'])

# print(train_R.head())

word_count = WordCounter()
word_count.fit(test_R['Content'])

# test_R['N_word'] = word_count.transform(test_R['Content'])

# print(test_R.head())

In [None]:

print(tokenizer(preprocess(train_R.iloc[0, 2])))

['clara', 'moskowitz', 'for', 'space', 'com', '2013', '06', '19', '15', '04', '30', 'utc', 'nasa', 's', 'grand', 'challenge', 'stop', 'asteroids', 'from', 'destroying', 'earth', 'there', 'may', 'be', 'killer', 'asteroids', 'headed', 'for', 'earth', 'and', 'nasa', 'has', 'decided', 'to', 'do', 'something', 'about', 'it', 'the', 'space', 'agency', 'announced', 'a', 'new', 'grand', 'challenge', 'on', 'june', '18', 'to', 'find', 'all', 'dangerous', 'space', 'rocks', 'and', 'figure', 'out', 'how', 'to', 'stop', 'them', 'from', 'destroying', 'our', 'planet', 'the', 'new', 'mission', 'builds', 'on', 'projects', 'already', 'underway', 'at', 'nasa', 'including', 'a', 'plan', 'to', 'capture', 'an', 'asteroid', 'pull', 'it', 'in', 'toward', 'the', 'moon', 'and', 'send', 'astronauts', 'to', 'visit', 'it', 'as', 'part', 'of', 'the', 'grand', 'challenge', 'the', 'agency', 'issued', 'a', 'request', 'for', 'information', 'today', 'aiming', 'to', 'solicit', 'ideas', 'from', 'industry', 'academia', 'and

In [None]:
a = train_R.iloc[5]
print(BSoup(a['Page content'], 'html.parser').get_text())
print(f'Popularity: {a["Popularity"]}')
# Clara Moskowitz for Space.com 2013-06-19 15:04:30 UTC NASA's Grand Challenge: Stop Asteroids From Destroying Earth There may be killer asteroids headed for Earth,
# and NASA has decided to do something about it. The space agency announced a new "Grand Challenge" on June 18 to find all dangerous space rocks and figure out how
# to stop them from destroying our planet. The new mission builds on projects already underway at NASA, including a plan to capture an asteroid, pull it in toward the
#  moon and send astronauts to visit it. As part of the Grand Challenge, the agency issued a "request for information" today — aiming to solicit ideas from industry,
#   academia and the public on how to improve the asteroid mission plan. "We're asking for you to think about concepts and different approaches for what we've described
#    here," William Gerstenmaier, NASA's associate administrator for human explorations and operations, said yesterday during a NASA event announcing the initiative. "We want
#     you to think about other ways of enhancing this to get the most out of it." SEE ALSO: How It Works: NASA Asteroid-CaptureResponses to the request for information,
#     which also seeks ideas for detecting and mitigating asteroid threats, are due July 18.The asteroid-retrieval mission, designed to provide the first
#     deep-space mission for astronauts flying on NASA's Space Launch System rocket and Orion space capsule under development, has come under fire from
#     lawmakers who would prefer that NASA return to the moon.A draft NASA authorization bill from the House space subcommittee, which is currently in debate,
#     would cancel the mission and steer the agency toward other projects. That bill will be discussed during a hearing
#     Wednesday, June 19 at 10 a.m. EDT.SEE ALSO: How It Works: NASA Asteroid-Capture Mission in PicturesBut NASA officials defended the asteroid mission
#     today and said they were confident they'd win Congress' support once they explained its benefits further."I think that we really, truly are going to be able to
#     show the value of the mission," NASA Associate Administrator Lori Garver said today. "To me, this is something that what we do in this country — we debate how
#     we spend the public's money. This is the beginning of the debate."Garver also maintained that sending astronauts to an asteroid would not diminish NASA's other
#     science and exploration goals, including another lunar landing.SEE ALSO: Animation Of Proposed Asteroid Retrieval Mission"This initiative takes nothing from the other
#     valuable work," she said. "This is only a small piece of our overall strategy, but it is an integral piece. It takes nothing from the moon."Part of NASA's plan to
#     win support for the flight is to link it more closely with the larger goal of protecting Earth from asteroid threats.If, someday, humanity discovers an asteroid
#     headed for Earth and manages to alter its course, "it will be one of the most important accomplishments in human history," said Tom Kalil, deputy director for
#     technology and innovation at the White House Office of Science and Technology Policy.SEE ALSO: Wildest Private Deep-Space Mission Ideas: A CountdownThe topic
#     of asteroid threats is more timely than ever, after a meteor exploded over the Russian city of Chelyabinsk on Feb. 15 — the same day that the football field-sized
#     asteroid 2012 DA14 passed within the moon's orbit of Earth.Image courtesy of NASA  SpaceX's Musk Says Sabotage Unlikely Cause of Sept. 1 Explosion, But Still a
#     Worry Proxima Centauri Is Like Our Sun... on Steroids China Launches Shenzhou-11 Astronauts to Tiangong-2 Space Lab Space Station Mockup In Houston -
#     Astronaut Guided Tour | Video   This article originally published at Space.com here
# Topics: Asteroid, Asteroids, challenge, Earth, Space, U.S., World

# By Christina Warren2013-03-28 17:40:55 UTCGoogle's New Open Source Patent Pledge: We Won't Sue Unless Attacked First Google took a stand of sorts against patent-lawsuit
# theater Thursday with its new Open Patent Non-Assertion (OPN) Pledge.  As explained by Google's Duane Valz, under the OPN Pledge, Google promises "not to sue any
# user, distributor or developer of open-source software on specified patents, unless first attacked." Now, Google isn't making all of its patents available for others.
# Instead, its starting small with 10 patents focused on MapReduce, a programming model for handling large data sets. There are already open-sourced versions of MapReduce
# available — including Hadoop — that are widely used across the Internet. Google says that over time, it plans to extend the OPN Pledge to more Google patents. SEE ALSO:
# 10 Interesting Google Glass Winners and Their Wacky Ideas  The big caveat here is that Google is pledging not to sue anyone who uses its MapReduce patents for Free or
# Open Source Software. Google is defining Free or Open Source software as any software that meets the Open Source Initiative's "Open Source Definition," as well as any
# version of the Free Software Foundation's "Free Software Definition." Still, Google iterates that the OPN Pledge isn't limited to a specific project or open-source
# copyright — as long as the project meets the FSF or OSI's definition for Free Software or Open Source Software, it's protected by the OPN Pledge. Google hopes that
# its OPN Pledge can be a model for other companies who are looking at how to "put their own patents into the service of open-source software." In the short-term,
# we're not sure what this does — except indemnify the open source MapReduce projects from a Google lawsuit. The bigger picture, however, is that this could be a
# new model for the way that patents are applied to open source as a whole. And that's a good thing. Image courtesy of iStockphoto, stuartbur
# Topics: Apps and Software, Google, open source, opn pledge, patent lawsuit theater, software patents, Tech, U.S.
# 1

#  Brendan Greeley for Bloomberg 2013-11-21 18:00:42 UTC The Underdog Internet Providers Head to Washington “Kind of like a union?” I ask Craig Foster.
#  “Eh, yeah,” he says, “except without the union part.” Foster, the CFO of Ubiquiti Networks, was in New York this week, in part to talk about the
#  Ubiquiti World Network, a trade group that will bring together smaller wireless Internet service providers and give them what they really need: a
#  lobbyist. Ubiquiti, whose chief executive officer, Robert Pera, was profiled in Bloomberg Businessweek earlier this year, makes hardware that sends Internet
#  signals over long distances without wires. Your phone carrier does this, but Ubiquiti’s customers, unlike your phone carrier, use unlicensed spectrum.
#  That is, they don’t have an exclusive license from the Federal Communications Commission to transmit. Instead, they send their signals over certain,
#  limited frequencies that have been set aside for anyone to use. Ubiquiti has focused its engineering talent on strengthening signals, sending them through
#  cluttered airwaves, and filtering signal from noise at the other end. This has created for them a large customer base in emerging markets, where there’s less
#  capital to spare for expensive fiber-optic networks. It’s a market large enough that Pera has been able to buy the Memphis Grizzlies. But there are also places
#  in America, mostly rural, where wired infrastructure is too expensive to build. And there are places in America, mostly urban, where wired Internet access is
#  too expensive to pay for. SEE ALSO: How Important Is Spectrum in Your Life?  Foster estimates that between 3,500 and 5,000 wireless Internet service providers
#  serve these markets. I went to a meeting of them last year; WISPs, as they call themselves, tend to be small, run by engineers who get a kick out of solving
#  problems while hanging by a clip from the top of a tower. It’s an industry still small enough that it calls its annual national conference “WISPapalooza.” The
#  companies that use licensed spectrum — like your cellphone carrier — have pull in Washington, both directly and through trade groups like the CTIA. They need to,
#  because they can’t operate without the licenses that come from the FCC. The WISPs, thus far, have no pull, because all they’ve needed from Washington is unlicensed
#  spectrum, a public good that Washington already provides. It’s unfortunate but true that members of Congress, if they don’t have anyone calling them about an issue,
#  assume that it’s unimportant and move on to the stuff that everyone’s yelling about. Which means that public goods don’t do well in Washington unless they happen to
#  line up with some large company’s commercial interests. Net neutrality, like unlicensed spectrum, lowers the barrier to entry for new companies, and has found (somewhat)
#  committed proponents as Google and Facebook increase the amounts of money they send back East. Earlier this year, Google cracked the top 10 in Washington in terms of
#  dollars spent on lobbying. Facebook more than tripled its lobbying spend last year as well. (Although, these companies are only as benevolent as their own interests
#  will allow them to be. On net neutrality, they line up on the same side as consumer groups. On privacy, they square off against them.) SEE ALSO: The New Lobbyists: Tech
#  Takes Washington  This is the challenge that Ubiquiti, with its new network of WISPs, has to overcome. There are a lot of people in Washington who are paid to demand
#  that the FCC free up more spectrum and license it to mobile phone carriers. They’ve been yelling about it for a while. So there’s little understanding in Congress
#  that unlicensed spectrum can generate economic activity, too, even though no one owns exclusive access to it. In February, Yochai Benkler, a fellow at Harvard’s
#  Berkman Center for Internet and Society, made a case for the potential of this market, pointing out that 87% of health-care wireless applications, for example,
#  use unlicensed spectrum. Regulation hasn’t caught up with technology, Benkler argued. The FCC shouldn’t just be protecting unlicensed bands, but expanding them.
#  Benkler’s academic ideas line up nicely with Ubiquiti’s financial interests, which gives the company a chance to champion what to Washington is still a new idea. I
#  ask Foster what bands, in particular, he’s interested in lobbying the FCC to free up for unlicensed use. “We’re going to push for anything we can get our hands
#  on” is his answer. The Ubiquiti World Network is going to hire a lobbyist, he says. “We’re a little bit green, but we have committed some money.” It will take a
#  lot more than some. SEE ALSO: Steve Case's Second Life as D.C.'s Favorite Businessman  Image: Flickr, Trevor McGoldrick  Options Insight: VanEck Vectors Gold Miners
#  ETF  How Concerned Are Investors Over Bank Earnings? Will Yellen Address Fed Rate Hike in Speech? Is U.K. Prime Minister Theresa May's Honeymoon Over?   This
#  article originally published at Bloomberg here
#  Topics: government, internet, internet service provider, lobbying, lobbyists, Mobile, Politics, Startups, U.S., World
# -1


 Brendan Greeley for Bloomberg 2013-11-21 18:00:42 UTC The Underdog Internet Providers Head to Washington “Kind of like a union?” I ask Craig Foster. “Eh, yeah,” he says, “except without the union part.” Foster, the CFO of Ubiquiti Networks, was in New York this week, in part to talk about the Ubiquiti World Network, a trade group that will bring together smaller wireless Internet service providers and give them what they really need: a lobbyist. Ubiquiti, whose chief executive officer, Robert Pera, was profiled in Bloomberg Businessweek earlier this year, makes hardware that sends Internet signals over long distances without wires. Your phone carrier does this, but Ubiquiti’s customers, unlike your phone carrier, use unlicensed spectrum. That is, they don’t have an exclusive license from the Federal Communications Commission to transmit. Instead, they send their signals over certain, limited frequencies that have been set aside for anyone to use. Ubiquiti has focused its engineering t

In [None]:
# print(train0[:5])

In [None]:
Z_train = train_R.iloc[:50]
Z_valid = valid_R.iloc[:50]
Z_test = test_R.iloc[:50]

In [7]:
train_R, valid_R = train_test_split(train_R, test_size=0.25, random_state=100)

Index(['Id', 'Popularity', 'Page content', 'Days', 'Time', 'Title', 'Content',
       'N_word'],
      dtype='object') \
Index(['Id', 'Page content', 'Days', 'Time', 'Title', 'Content', 'N_word'], dtype='object')

In [8]:
train_numeric = train_R[['Days', 'Time', 'N_word']]
train_alphabetic = train_R[['Title', 'Content']].to_numpy()
y_train = train_R['Popularity'].to_numpy()

valid_numeric = valid_R[['Days', 'Time', 'N_word']]
valid_alphabetic = valid_R[['Title', 'Content']].to_numpy()
y_valid = valid_R['Popularity'].to_numpy()

test_numeric = test_R[['Days', 'Time', 'N_word']]
test_alphabetic = test_R[['Title', 'Content']].to_numpy()

In [10]:
print(train_alphabetic.shape)

(20732, 2)


In [9]:
std = StandardScaler()
std.fit(train_numeric)

train_numeric_std = std.transform(train_numeric)
valid_numeric_std = std.transform(valid_numeric)
test_numeric_std = std.transform(test_numeric)

In [None]:
# pd.concat([valid_alphabetic, valid_numeric], axis =1)
# np.concatenate((train_alphabetic.to_numpy(), train_numeric_std.to_numpy()), axis=1)[:5]

In [None]:
# train_alphabetic.head(5)

Unnamed: 0,Title,Content
26267,New Class of 'Easily Retrievable' Asteroids Di...,Asteroids that pass close to Earth have becom...
2246,History Buffs Discover Their Hidden Past in Ne...,A lot of history happened before you came in...
18932,Cat Stevens Announces First U.S. Concert Tour ...,NEW YORK — New Rock and Roll Hall of Fame mem...
6310,Inside MoMA's Classic Video Games Exhibit,"MoMA: ""Applied Design"" The Museum o..."
2375,10 countries with the most people living in mo...,"An estimated 35.8 million men, women and chil..."


In [None]:
tfidf_trans = TfidfVectorizer(ngram_range=(1,1),
                            preprocessor=preprocess,
                            tokenizer=tokenizer_stem_noStopWords)

tfidf_trans.fit(train_R["Page content"][:2])
pd.DataFrame(tfidf_trans.transform(train_R["Page content"].head()).toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,297,298,299,300,301,302,303,304,305,306
0,0.06488,0.03244,0.03244,0.03244,0.03244,0.03244,0.0,0.03244,0.12976,0.03244,...,0.0,0.0,0.023081,0.06488,0.0,0.03244,0.06488,0.12976,0.0,0.03244
1,0.0,0.0,0.0,0.0,0.0,0.0,0.08522,0.0,0.0,0.0,...,0.08522,0.08522,0.060635,0.0,0.08522,0.0,0.0,0.0,0.255661,0.0
2,0.0,0.0,0.0,0.0,0.150839,0.0,0.0,0.150839,0.075419,0.0,...,0.0,0.0,0.107323,0.075419,0.0,0.0,0.0,0.075419,0.226258,0.0
3,0.0,0.0,0.057255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057255,0.0
4,0.0,0.0,0.0,0.0,0.091841,0.0,0.0,0.0,0.091841,0.0,...,0.0,0.0,0.130692,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
tfidf_trans = TfidfVectorizer(ngram_range=(1,1),
                            preprocessor=preprocess,
                            tokenizer=tokenizer_stem_noStopWords)

hashvec0 = HashingVectorizer(n_features=2**10,
                            tokenizer=tokenizer_stem_noStopWords)
hashvec1 = HashingVectorizer(n_features=2**10,
                            tokenizer=tokenizer_stem_noStopWords)

hashvec0.fit(train_alphabetic[:, 0])
train_R_preprocessed0 = hashvec0.transform(train_alphabetic[:, 0]).toarray()


hashvec1.fit(train_alphabetic[:, 1])
train_R_preprocessed1 = hashvec1.transform(train_alphabetic[:, 1]).toarray()

# pipe1 = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,1),
#                         preprocessor=preprocess,
#                         tokenizer=tokenizer_stem_noStopWords)), ('clf', LogisticRegression(solver='liblinear'))])

# pipe2 = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,1),
#                         preprocessor=preprocess,
#                         tokenizer=tokenizer_stem_noStopWords))])

# text_preprocessor = ColumnTransformer([('tfidf1', TfidfVectorizer(ngram_range=(1,1),
#                                             preprocessor=preprocess,
#                                             tokenizer=tokenizer_stem_noStopWords), ['Title']),
#                                        ('tfidf2', TfidfVectorizer(ngram_range=(1,1),
#                                             preprocessor=preprocess,
#                                             tokenizer=tokenizer_stem_noStopWords), ['Content'])
#                                        ], remainder='passthrough')

# text_preprocessor.fit(train_alphabetic)
# temp = text_preprocessor.transform(train_alphabetic)

# print(pd.DataFrame(temp, columns=text_preprocessor.get_feature_names_out()))
# train_processed = np.concatenate((text_preprocessor.fit_transform(train_alphabetic), train_numeric_std.to_numpy()), axis=1)


# rf.fit(train_processed)

# scores = cross_val_score(estimator=pipe1, X=Z_train['Page content'], y=Z_train['Popularity'],
#                          cv=10, scoring='roc_auc')

# ['Days', 'Time', 'N_word']
# tfidf_trans.fit(train_R["Page content"].to_numpy())
# train_R_preprocessed = tfidf_trans.transform(train_R["Page content"]).toarray()
# train_R_preprocessed['Days'] = train_R['Days']
# train_R_preprocessed['Time'] = train_R['Time']
# train_R_preprocessed['N_word'] = train_R['N_word']
# train_R_preprocessed = train_R_preprocessed.rename(str, axis="columns")

# valid_R_preprocessed = tfidf_trans.transform(valid_R["Page content"]).toarray()
# valid_R_preprocessed['Days'] = valid_R['Days']
# valid_R_preprocessed['Time'] = valid_R['Time']
# valid_R_preprocessed['N_word'] = valid_R['N_word']
# valid_R_preprocessed = valid_R_preprocessed.rename(str, axis="columns")

# # pipe2.fit(train_R['Page content'], train_R["Popularity"])
# rf.fit(train_R_preprocessed, train_R['Popularity'])
# scores1 = roc_auc_score(train_R["Popularity"], rf.predict_proba(train_R_preprocessed))
# scores2 = roc_auc_score(valid_R["Popularity"], rf.predict_proba(valid_R_preprocessed))

# # scores1 = pipe1.score(valid_R['Page content'], valid_R["Popularity"])
# # scores2 = pipe1.score(train_R['Page content'], train_R["Popularity"])

# # valid_R_pred = pipe1.predict(valid_R['Page content'])
# # print(valid_R_pred[:10], '---', valid_R.iloc[:10, 1])


# print(f"scores1: {scores1}")
# print(f"scores2: {scores2}")
# # print(scores.mean())
# # print(scores.std())

In [15]:
# train_R_preprocessed = tfidf_trans.transform(train_R["Page content"]).toarray()
# valid_R_preprocessed = tfidf_trans.transform(valid_R["Page content"]).toarray()


In [None]:
# train_R_preprocessed['Days'] = train_R['Days']
# train_R_preprocessed['Time'] = train_R['Time']
# train_R_preprocessed['N_word'] = train_R['N_word']

# valid_R_preprocessed['Days'] = valid_R['Days']
# valid_R_preprocessed['Time'] = valid_R['Time']
# valid_R_preprocessed['N_word'] = valid_R['N_word']

In [13]:
train_R_preprocessed0

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
valid_R_preprocessed0 = hashvec0.transform(valid_alphabetic[:, 0]).toarray()
valid_R_preprocessed1 = hashvec1.transform(valid_alphabetic[:, 1]).toarray()

In [14]:
train_R_preprocessed = np.concatenate((train_R_preprocessed0, train_R_preprocessed1, train_numeric_std), axis=1)
valid_R_preprocessed = np.concatenate((valid_R_preprocessed0, valid_R_preprocessed1, valid_numeric_std), axis=1)

In [None]:
# train_R_preprocessed = train_R_preprocessed.rename(str, axis="columns")
# valid_R_preprocessed = valid_R_preprocessed.rename(str, axis="columns")

# train_R_temp = train_R_preprocessed['0'][:].toarray()

In [17]:
print(valid_R_preprocessed)
print(valid_numeric_std)

[[ 0.          0.          0.         ...  0.89458274  1.16357761
  -0.97430135]
 [ 0.          0.          0.         ... -0.25476745  0.89387165
  -0.91454607]
 [ 0.          0.          0.         ...  1.46925783 -1.66145237
  -0.47191434]
 ...
 [ 0.          0.          0.         ...  0.89458274  1.39560809
   0.32924909]
 [ 0.          0.          0.         ...  2.04393293  0.3903672
   0.80729135]
 [ 0.          0.          0.         ...  0.31990764 -1.9248861
  -0.15764581]]
[[ 0.89458274  1.16357761 -0.97430135]
 [-0.25476745  0.89387165 -0.91454607]
 [ 1.46925783 -1.66145237 -0.47191434]
 ...
 [ 0.89458274  1.39560809  0.32924909]
 [ 2.04393293  0.3903672   0.80729135]
 [ 0.31990764 -1.9248861  -0.15764581]]


In [18]:
rf = RandomForestClassifier(max_depth=5)

rf.fit(train_R_preprocessed, y_train)
scores1 = roc_auc_score(y_train, rf.predict(train_R_preprocessed))
scores2 = roc_auc_score(y_valid, rf.predict(valid_R_preprocessed))

# scores1 = pipe1.score(valid_R['Page content'], valid_R["Popularity"])
# scores2 = pipe1.score(train_R['Page content'], train_R["Popularity"])

# valid_R_pred = pipe1.predict(valid_R['Page content'])
# print(valid_R_pred[:10], '---', valid_R.iloc[:10, 1])


print(f"scores1: {scores1}")
print(f"scores2: {scores2}")
# print(scores.mean())
# print(scores.std())

scores1: 0.6452339839189981
scores2: 0.5357681973896193


In [19]:
rf11 = RandomForestClassifier(max_depth=5)
rf22 = RandomForestClassifier(max_depth=5)
rf33 = RandomForestClassifier(max_depth=5)

# tfidf_trans1 = TfidfVectorizer(ngram_range=(1,1),
#                               tokenizer=tokenizer_stem_noStopWords)
# tfidf_trans2 = TfidfVectorizer(ngram_range=(1,1),
#                               tokenizer=tokenizer_stem_noStopWords)

# tfidf_trans1.fit(train_alphabetic[0])
# tfidf_trans2.fit(train_alphabetic[1])
# train_alphabetic1 = tfidf_trans1.transform(train_alphabetic[0]).toarray()
# train_alphabetic2 = tfidf_trans2.transform(train_alphabetic[1]).toarray()

# valid_alphabetic1 = tfidf_trans1.transform(valid_alphabetic[0]).toarray()
# valid_alphabetic2 = tfidf_trans2.transform(valid_alphabetic[1]).toarray()
# train_numeric_std

rf11.fit(train_R_preprocessed0, y_train)
rf22.fit(train_R_preprocessed1, y_train)
rf33.fit(train_numeric_std, y_train)

train_pred1 = rf11.predict(train_R_preprocessed0)
train_pred2 = rf22.predict(train_R_preprocessed1)
train_pred3 = rf33.predict(train_numeric_std)

valid_pred1 = rf11.predict(valid_R_preprocessed0)
valid_pred2 = rf22.predict(valid_R_preprocessed1)
valid_pred3 = rf33.predict(valid_numeric_std)


train_prediction = []
for i in range(train_pred1.shape[0]):
    pred = 0
    # np = 0
    pred += 1 if train_pred1[i]==1 else -1
    pred += 1 if train_pred2[i]==1 else -1
    pred += 1 if train_pred3[i]==1 else -1
    train_prediction.append(1 if pred > 0 else -1)

scores1 = roc_auc_score(y_train, train_prediction)
print(f"scores1: {scores1}")

valid_prediction = []
for i in range(valid_pred1.shape[0]):
    pred = 0
    # np = 0
    pred += 1 if valid_pred1[i]==1 else -1
    pred += 1 if valid_pred2[i]==1 else -1
    pred += 1 if valid_pred3[i]==1 else -1
    valid_prediction.append(1 if pred > 0 else -1)

scores2 = roc_auc_score(y_valid, valid_prediction)
print(f"scores2: {scores2}")


scores1: 0.5979363465153764
scores2: 0.5198769875479529


In [None]:
scores1 = pipe1.score(valid_R['Page content'], valid_R["Popularity"])
scores2 = pipe1.score(train_R['Page content'], train_R["Popularity"])

valid_R_pred = pipe1.predict(valid_R['Page content'])
print(valid_R_pred[:10], '---', valid_R.iloc[:10, 1])


print(f"scores1: {scores1}")
print(f"scores2: {scores2}")

[-1  1 -1 -1 -1 -1 -1 -1 -1  1] --- 8000   -1
8001   -1
8002   -1
8003    1
8004   -1
8005    1
8006   -1
8007   -1
8008   -1
8009   -1
Name: Popularity, dtype: int64
scores1: 0.526752532708853
scores2: 0.981


### Result

predict the testing data and save it as csv file

In [23]:
test_R_preprocessed0 = hashvec0.transform(test_alphabetic[:, 0]).toarray()
test_R_preprocessed1 = hashvec1.transform(test_alphabetic[:, 1]).toarray()

# test_R_preprocessed = np.concatenate((test_R_preprocessed0, test_R_preprocessed1, test_numeric_std), axis=1)

In [26]:
type(test_numeric_std)

numpy.ndarray

In [29]:
test_R_preprocessed = np.concatenate((test_R_preprocessed0, test_R_preprocessed1, test_numeric_std), axis=1)

In [37]:
test_R_pred = rf.predict_proba(test_R_preprocessed)

In [None]:
test_R_pred = pipe1.predict(test_R['Page content'])


In [40]:
print(test_R_pred[:10])

[[0.5034019  0.4965981 ]
 [0.50125111 0.49874889]
 [0.52102301 0.47897699]
 [0.51003137 0.48996863]
 [0.50929964 0.49070036]
 [0.51109247 0.48890753]
 [0.48830523 0.51169477]
 [0.49570734 0.50429266]
 [0.48548335 0.51451665]
 [0.50843542 0.49156458]]


In [41]:
df = pd.DataFrame(test_R_pred[:, 1], columns=["Popularity"])
df["Id"] = test_R['Id']
df = df.iloc[:, [1, 0]]
df.head(10)

Unnamed: 0,Id,Popularity
0,27643,0.496598
1,27644,0.498749
2,27645,0.478977
3,27646,0.489969
4,27647,0.4907
5,27648,0.488908
6,27649,0.511695
7,27650,0.504293
8,27651,0.514517
9,27652,0.491565


In [42]:
# df = pd.DataFrame(data=test_R_pred.reshape(test_R_pred.shape[0], 1), columns=['Popularity'])
df.to_csv('./prediction4.csv', index=False)

In [None]:
def get_stream(data, size):
    for chunk in data
        yield chunk

In [None]:

hashvec = HashingVectorizer(n_features=2**20,
                            preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
# loss='log' gives logistic regression
clf = SGDClassifier(loss='log', max_iter=100, tol=1e-3)
batch_size = 1000
stream = get_stream(path='./dataset/sentiment/train.csv', size=batch_size)
classes = np.array([0, 1])
train_auc, val_auc = [], []
# we use one batch for training and another for validation in each iteration
iters = int((25000+batch_size-1)/(batch_size*2))
for i in range(iters):
    batch = next(stream)
    X_train, y_train = batch['review'], batch['sentiment']
    if X_train is None:
        break
    X_train = hashvec.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    train_auc.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]))

    # validate
    batch = next(stream)
    X_val, y_val = batch['review'], batch['sentiment']
    score = roc_auc_score(y_val, clf.predict_proba(hashvec.transform(X_val))[:,1])
    val_auc.append(score)
    print('[{}/{}] {}'.format((i+1)*(batch_size*2), 25000, score))

## History