In [1]:
import csv
import re
from collections import Counter

In [2]:
# File locations, given relative to the notebook's working directory:
#   input_filename   - CSV of posts that the filtering cell reads
#   output_filename  - CSV that receives every row mentioning "pytorch"
#   results_filename - CSV summary (post count and the top tag keywords)
input_filename = "../data/sso_posts.csv"
output_filename = "../data/pytorch_posts.csv"
results_filename = "../results/pytorch_results.csv"

In [3]:
# Running tallies filled in by the filtering step: how often each matching
# tag was seen, and how many posts matched overall
keyword_counter = Counter()
num_posts = 0

# Case-insensitive matcher for the substring "pytorch"; it is applied to the
# title, body and tags of every post
pattern = re.compile(r"pytorch", flags=re.I)

In [4]:

# Filter the input CSV down to the posts that mention "pytorch" in their
# title, body or tags, copy those rows to the output CSV, and tally how many
# posts matched and how often each matching tag occurred.

# Start the tallies from zero inside this cell so that re-running it does not
# double-count on top of a previous run.
num_posts = 0
keyword_counter = Counter()

# newline="" is passed for the input as well as the output: the csv module
# does its own line-ending handling, and without it "\r\n" sequences inside
# quoted fields (e.g. multi-line post bodies) are rewritten before parsing.
with open(input_filename, "r", encoding="utf-8", newline="") as input_file, \
        open(output_filename, "w", encoding="utf-8", newline="") as output_file:

    # The output file reuses the input's header so matching rows are copied
    # through column-for-column
    input_reader = csv.DictReader(input_file)
    output_writer = csv.DictWriter(output_file, fieldnames=input_reader.fieldnames)
    output_writer.writeheader()

    for row in input_reader:

        # DictReader fills the columns missing from a short row with None;
        # treat those as empty text instead of letting re.search raise
        # TypeError. A column absent from the header still raises KeyError.
        if not any(pattern.search(row[field] or "") for field in ("Title", "Body", "Tags")):
            continue

        num_posts += 1

        # Tags are stored as "<tag1><tag2>..."; peel off the outer brackets,
        # split on the inner "><", and count only the tags that themselves
        # contain "pytorch"
        tags = (row["Tags"] or "").strip("<>").split("><")
        keywords = [tag for tag in tags if pattern.search(tag)]
        keyword_counter.update(keywords)

        output_writer.writerow(row)

In [5]:

# Rank the tags from most to least frequent as a list of (tag, count) pairs;
# tags with equal counts stay in the order they were first counted
sorted_keywords = keyword_counter.most_common()

In [6]:
# Write the summary CSV: a post-count row, a section label, then one
# (keyword, count) row for each of the ten most frequent keywords
with open(results_filename, "w", encoding="utf-8", newline="") as results_file:
    results_writer = csv.writer(results_file)

    # Header section of the report
    results_writer.writerows([
        ["Number of posts:", num_posts],
        ["Top keywords:"],
    ])

    # sorted_keywords is already in descending order, so the first ten
    # entries are the top ten
    for keyword, count in sorted_keywords[:10]:
        results_writer.writerow((keyword, count))

In [7]:
import pandas as pd

In [11]:
# Load the cleaned PyTorch posts and preview ten randomly chosen rows.
# NOTE(review): this file is not written by the filtering cell above (that
# one writes pytorch_posts.csv) - confirm which step produces it.
# NOTE(review): the previewed frame has an "Unnamed: 0" column, which usually
# means the CSV was saved together with its index; consider index_col=0 here
# once it is confirmed that no later cell relies on that column.
data = pd.read_csv('../data/cleaned_pytorch_posts.csv')

# A fixed random_state makes the preview identical on every run; without it
# each execution shows a different sample.
data.sample(10, random_state=42)

Unnamed: 0,Id,PostTypeId,ParentId,Title,Body,Score,Tags,ViewCount,FavoriteCount
6643,61451339,1,,PyTorch LSTM crashing on colab gpu works fine ...,Hello I have following LSTM which runs fine on...,1,pytorch gpu lstm google-colaboratory,594.0,
16705,71083080,1,,registering a hook with pytorch does not chang...,I m trying to remove nans infs from a gradient...,1,python pytorch gradient nan,120.0,
6546,61318213,1,,From Coco annotation json to semantic segmenta...,I am trying to use COCO 2014 data for semantic...,1,pytorch semantic-segmentation coco,3389.0,
11027,66051641,1,,How to create a submodel from a pretrained mod...,So I have been working on neural style transfe...,1,machine-learning pytorch pre-trained-model sty...,1148.0,
11911,66921943,1,,PyTorch Tensor Operation for adding the maximu...,Follow Up question to Basically adding the max...,1,python pytorch dynamic-programming tensor,115.0,
12967,67869267,1,,Subset object is not an iterator for updating ...,I m updating a pytorch network from legacy cod...,1,pytorch sentiment-analysis imdb,265.0,
9072,63991646,1,,How to find the mean and the covariance of a 2...,I have a tensor of shape h w which consists of...,0,pytorch mean distribution covariance,111.0,
5277,59581762,1,,install import Pytorch on mac osx python 3 7,I have torch all installed in a virtual env bu...,2,pip pytorch python-3.7 parlai,221.0,
2835,55648989,1,,How to make pycharm use a different cuda toolkit,I want to run an MXNet module in GPU I have a ...,0,python pycharm mxnet,1162.0,
13623,68425009,1,,One Hot Encoding in Pytorch for two classes,I have a tensor of 0 and 1 which I calculated ...,0,python pytorch one-hot-encoding,800.0,
