diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..17d4e87 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,27 @@ +name: Ruby + +on: + push: + branches: + - main + + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + name: Ruby ${{ matrix.ruby }} + strategy: + matrix: + ruby: + - '3.3.4' + + steps: + - uses: actions/checkout@v4 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby }} + bundler-cache: true + - name: Run the default task + run: bundle exec rake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9106b2a --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +/.bundle/ +/.yardoc +/_yardoc/ +/coverage/ +/doc/ +/pkg/ +/spec/reports/ +/tmp/ diff --git a/.standard.yml b/.standard.yml new file mode 100644 index 0000000..6d67c99 --- /dev/null +++ b/.standard.yml @@ -0,0 +1,3 @@ +# For available configuration options, see: +# https://github.com/standardrb/standard +ruby_version: 3.0 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..4161d30 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +# Changelog + +## 1.0.0 + +- Initial release. diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..ebd35d1 --- /dev/null +++ b/Gemfile @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +# Specify your gem's dependencies in names_dataset.gemspec +gemspec + +gem "rake", "~> 13.0" + +gem "minitest", "~> 5.16" + +gem "standard", "~> 1.3" diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..13e90b6 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,68 @@ +PATH + remote: . 
+ specs: + names_dataset (1.0.0) + iso_country_codes (~> 0.7.6) + rubyzip (~> 2.3) + +GEM + remote: https://rubygems.org/ + specs: + ast (2.4.2) + iso_country_codes (0.7.8) + json (2.9.1) + language_server-protocol (3.17.0.3) + lint_roller (1.1.0) + minitest (5.25.4) + parallel (1.26.3) + parser (3.3.6.0) + ast (~> 2.4.1) + racc + racc (1.8.1) + rainbow (3.1.1) + rake (13.2.1) + regexp_parser (2.10.0) + rubocop (1.69.2) + json (~> 2.3) + language_server-protocol (>= 3.17.0) + parallel (~> 1.10) + parser (>= 3.3.0.2) + rainbow (>= 2.2.2, < 4.0) + regexp_parser (>= 2.9.3, < 3.0) + rubocop-ast (>= 1.36.2, < 2.0) + ruby-progressbar (~> 1.7) + unicode-display_width (>= 2.4.0, < 4.0) + rubocop-ast (1.37.0) + parser (>= 3.3.1.0) + rubocop-performance (1.23.0) + rubocop (>= 1.48.1, < 2.0) + rubocop-ast (>= 1.31.1, < 2.0) + ruby-progressbar (1.13.0) + rubyzip (2.3.2) + standard (1.43.0) + language_server-protocol (~> 3.17.0.2) + lint_roller (~> 1.0) + rubocop (~> 1.69.1) + standard-custom (~> 1.0.0) + standard-performance (~> 1.6) + standard-custom (1.0.2) + lint_roller (~> 1.0) + rubocop (~> 1.50) + standard-performance (1.6.0) + lint_roller (~> 1.1) + rubocop-performance (~> 1.23.0) + unicode-display_width (3.1.3) + unicode-emoji (~> 4.0, >= 4.0.4) + unicode-emoji (4.0.4) + +PLATFORMS + ruby + +DEPENDENCIES + minitest (~> 5.16) + names_dataset! + rake (~> 13.0) + standard (~> 1.3) + +BUNDLED WITH + 2.5.16 diff --git a/README.md b/README.md index 244e22a..94f4f6d 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,130 @@ # First and Last Names Dataset -A Ruby port of https://github.com/philipperemy/name-dataset +`NamesDataset` is a Ruby library (ported from the python [philipperemy/name-dataset](https://github.com/philipperemy/name-dataset) library) that provides fast lookups and metadata for first and last names. Ever wondered if “Zoe” is more likely a name from the United Kingdom or how popular “White” is as a last name in the United States? 
This library helps you answer those questions. + +`NamesDataset` can help you: +- Search for a first or last name and learn about: +- Probable country of origin +- Gender distribution (for first names) +- Rank/popularity +- Get lists of top names by country and gender. + +Under the hood, `NamesDataset` loads an in-memory dataset (derived from a Facebook leak of 533M users) that’s roughly 3.2GB once loaded into memory. Once loaded, it’s quick to search but definitely requires some hardware overhead, so keep that in mind if you’re planning on deploying this to production. + +## Requirements +- Ruby >= 3.0 (CI runs on 3.3.4). +- Approximately 3.2GB of RAM available to load the full dataset. + +## Installation + +Add the gem to your Gemfile and run bundle. + +```ruby +gem "names_dataset" +``` + +Then require the library and initialize it in your application. + +```ruby +require "names_dataset" + +# The library takes time to initialize because the database is massive. +# A tip is to include its initialization in your app's startup process. +nd = NamesDataset.new +``` + +## Usage + +`NamesDataset` provides methods to query the dataset for information about first and last names. Here are some examples: + +```ruby +nd = NamesDataset.new + +p nd.search("Philippe") +# => { +# :first_name => { +# "country" => { "France" => 0.63, "Belgium" => 0.12, ... }, +# "gender" => { "Male" => 0.99, "Female" => 0.01 }, +# "rank" => { "France" => 73, "Belgium" => 291, ... } +# }, +# :last_name => { +# "country" => {}, +# "gender" => {}, +# "rank" => {} +# } +# } + +p nd.search("Zoe") +# => { +# :first_name => { +# "country" => { "United Kingdom" => 0.52, "United States" => 0.23, ... }, +# "gender" => { "Female" => 0.98, "Male" => 0.02 }, +# "rank" => { "United Kingdom" => 140, "United States" => 315, ... } +# }, +# :last_name => { ... 
} +# } +``` + +The result is a Ruby Hash with the following structure: +- `:first_name`: a Hash with the string keys `"country"`, `"gender"`, and `"rank"` +- `:last_name`: a Hash with the same string keys `"country"`, `"gender"` (generally empty for last names), and `"rank"` + +### Memory Usage Disclaimer + +Because the library pre-loads the entire 3.2GB dataset into memory, you’ll need sufficient RAM to avoid NoMemoryError. If you only need a subset of the data or if memory is a major concern, consider alternative approaches (e.g., a streaming or database-based solution). But if you can spare the memory, NamesDataset is fast for repeated lookups once it’s loaded. + +### Top Names + +Similar to the Python library, you can fetch the most popular names by country or gender: + +```ruby +p nd.get_top_names(n: 10, gender: "Male", country_alpha2: "US") +# => { +# "US" => { +# "M" => ["Jose", "David", "Michael", "John", "Juan", ... ] +# } +# } + +p nd.get_top_names(n: 5, country_alpha2: "ES") +# => { +# "ES" => { +# "M" => ["Jose", "Antonio", "Juan", "Manuel", "David"], +# "F" => ["Maria", "Ana", "Carmen", "Laura", "Isabel"] +# } +# } +``` + +### Other Helpers + +```ruby +p nd.get_country_codes(alpha_2: true) +# => ["AE", "AF", "AL", "AO", "AR", "AT", ... ] + +nd.first_names +# => A Hash of first names mapped to their attributes (country, gender, rank, etc). + +nd.last_names +# => A Hash of last names mapped to their attributes (country, rank, etc). +``` + +## Full Dataset + +For offline or alternative usage, a link to the raw dataset can be found in the [original Python library](https://github.com/philipperemy/name-dataset/blob/6ae42a6a84a7b6460baa2cbd440f0cdf9fe81752/README.md#full-dataset). + +## Ports + +- This library is a port of the original Python library [philipperemy/name-dataset](https://github.com/philipperemy/name-dataset). + +## Contributing + +We welcome contributions! Feel free to open an issue or submit a pull request on GitHub. 
+ +## License + +This library is subject to the same considerations as the Python version: +- The dataset is generated from a large-scale Facebook leak (533M accounts). +- Basic lists of names are [typically not copyrightable](https://github.com/philipperemy/name-dataset/blob/6ae42a6a84a7b6460baa2cbd440f0cdf9fe81752/README.md#license), but please consult a lawyer if you have specific legal concerns. +- You can find the full license from the original python library in [that project](https://github.com/philipperemy/name-dataset/blob/6ae42a6a84a7b6460baa2cbd440f0cdf9fe81752/LICENSE). +- You can find the full license for this Ruby port in the [LICENSE](LICENSE) file at the root of this repository. + +Thanks for checking out `names_dataset`! If this library helps you ship something neat, I’d love to know about it, feel free to open a Pull Request or Issue :heart: diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..f32601d --- /dev/null +++ b/Rakefile @@ -0,0 +1,10 @@ +# frozen_string_literal: true + +require "bundler/gem_tasks" +require "minitest/test_task" + +Minitest::TestTask.create + +require "standard/rake" + +task default: %i[test standard] diff --git a/bin/console b/bin/console new file mode 100755 index 0000000..2c52fbf --- /dev/null +++ b/bin/console @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "names_dataset" + +# You can add fixtures and/or initialization code here to make experimenting +# with your gem easier. You can also use a different console, if you like. 
# In-memory lookup of first- and last-name metadata.
#
# Both datasets are Hashes parsed from a zipped JSON file and keyed by name.
# Every entry is expected to be a Hash with the string keys "country",
# "gender" and "rank"; "country" and "rank" are keyed by country code and
# "gender" by "M"/"F".
class NamesDataset
  # Base class for errors raised by this library.
  class Error < StandardError; end

  # Raw datasets exactly as parsed from the JSON files.
  attr_reader :first_names, :last_names

  FIRST_NAMES_ZIP_PATH = File.expand_path("../../data/first_names.zip", __FILE__)
  LAST_NAMES_ZIP_PATH = File.expand_path("../../data/last_names.zip", __FILE__)

  # Dataset gender codes mapped to the labels returned by #search.
  GENDER_LABELS = {"M" => "Male", "F" => "Female"}.freeze

  # Loads both datasets. A missing or unreadable archive yields an empty
  # dataset rather than an exception (see #load_zipped_json).
  #
  # @param first_names_path [String] zip archive holding the first-name JSON
  # @param last_names_path [String] zip archive holding the last-name JSON
  def initialize(first_names_path: FIRST_NAMES_ZIP_PATH, last_names_path: LAST_NAMES_ZIP_PATH)
    @first_names = load_zipped_json(first_names_path)
    @last_names = load_zipped_json(last_names_path)
  end

  # Looks a name up in both datasets.
  #
  # @param name [String, nil] name in any casing, surrounding whitespace allowed
  # @return [Hash] `{first_name: meta, last_name: meta}` where each `meta` has
  #   the string keys "country", "gender" and "rank"; all three are empty
  #   Hashes when the name is blank or unknown
  def search(name)
    return empty_result if name.nil? || name.strip.empty?

    key = normalize(name)
    {
      first_name: post_process(@first_names[key]) || empty_name_metadata,
      last_name: post_process(@last_names[key]) || empty_name_metadata
    }
  end

  # Lists every country that appears in the dataset.
  #
  # The first-name dataset is used; the last-name dataset is the fallback when
  # no first names are loaded. (An empty Hash is truthy, so a plain `||` would
  # never fall back.)
  #
  # @param alpha_2 [Boolean] true for raw country codes, false for country names
  # @return [Array<String>] unique codes or names; codes unknown to
  #   IsoCountryCodes are omitted from the name list instead of raising
  def get_country_codes(alpha_2: true)
    dataset = (@first_names.nil? || @first_names.empty?) ? @last_names : @first_names
    codes = (dataset || {}).values.flat_map { |entry| (entry["country"] || {}).keys }.uniq
    return codes if alpha_2

    codes.map { |code| country_name(code) }.compact
  end

  # Returns the best-ranked first names per country, bucketed by gender.
  #
  # Each name lands in exactly one bucket, chosen by #determine_gender. When
  # `gender` is given, only that bucket is returned.
  #
  # @param n [Integer] maximum number of names per country/gender bucket
  # @param gender [String, nil] anything starting with "m" (any case) selects
  #   "M"; any other value selects "F"; nil keeps every bucket
  # @param country_alpha2 [String, nil] restrict the result to one country code
  # @return [Hash{String => Hash{String => Array<String>}}] country code =>
  #   gender code ("M", "F" or "N/A") => names ordered by ascending rank
  # @raise [ArgumentError] if n is not positive
  # @raise [Error] if no first-name dataset is loaded
  def get_top_names(n: 10, gender: nil, country_alpha2: nil)
    raise ArgumentError, "n must be positive" if n <= 0

    dataset = @first_names
    raise Error, "No dataset loaded" if dataset.nil?

    wanted = gender && gender_code(gender)
    ranked = Hash.new { |h, k| h[k] = Hash.new { |hh, kk| hh[kk] = [] } }

    dataset.each do |name, data|
      label = determine_gender(data["gender"])
      # Compare against the bucket the name will land in, so that asking for
      # one gender never yields a bucket of the other.
      next if wanted && label != wanted

      (data["rank"] || {}).each do |country, rank|
        next if rank.nil? # nil cannot be ordered against Integer ranks
        next if country_alpha2 && country != country_alpha2

        ranked[country][label] << [name, rank]
      end
    end

    ranked.each_value do |buckets|
      buckets.transform_values! { |pairs| pairs.sort_by(&:last).take(n).map(&:first) }
    end

    ranked
  end

  # Reads the first entry of a zip archive and parses it as JSON.
  #
  # Best-effort by design: a missing file returns an empty Hash silently, and
  # any read or parse failure is reported with `warn` and also returns an
  # empty Hash.
  #
  # @param zip_path [String] path to the archive
  # @return [Hash] parsed JSON, or {} when nothing could be loaded
  def load_zipped_json(zip_path)
    return {} unless File.exist?(zip_path)

    content = nil
    Zip::File.open(zip_path) do |zip_file|
      entry = zip_file.first
      content = entry.get_input_stream.read if entry
    end
    content ? JSON.parse(content) : {}
  rescue => e
    warn "Failed to load or parse #{zip_path}: #{e.message}"
    {}
  end

  # Canonical dataset key for a name: trimmed, first letter upper-cased and
  # the rest lower-cased.
  def normalize(str)
    str.strip.capitalize
  end

  # Converts a raw dataset entry into the shape returned by #search.
  #
  # @param data [Hash, nil] raw entry
  # @return [Hash, nil] entry with country names and gender labels as keys,
  #   or nil when there is no entry
  def post_process(data)
    return nil unless data

    {
      "country" => map_country_codes(data["country"]),
      "gender" => map_gender(data["gender"]),
      "rank" => map_country_codes(data["rank"])
    }
  end

  # Re-keys a Hash from country codes to country names.
  #
  # Codes unknown to IsoCountryCodes are dropped. Entries whose value is nil
  # are dropped as well, which the previous implementation already did.
  def map_country_codes(data)
    (data || {}).each_with_object({}) do |(alpha2, value), named|
      next if value.nil?

      name = country_name(alpha2)
      named[name] = value if name
    end
  end

  # Re-keys a gender Hash from "M"/"F" to "Male"/"Female"; any other key is
  # kept unchanged.
  def map_gender(data)
    (data || {}).transform_keys { |key| GENDER_LABELS.fetch(key, key) }
  end

  # Whether the entry carries any share of the requested gender. Always true
  # when no gender is requested.
  def matches_gender?(data, gender)
    return true unless gender

    (data["gender"] || {}).key?(gender_code(gender))
  end

  # Picks the bucket for a name: "N/A" without gender data, the only key when
  # there is exactly one, otherwise "M" only if its share is strictly larger
  # than that of "F".
  def determine_gender(gender_data)
    return "N/A" if gender_data.nil? || gender_data.empty?
    return gender_data.keys.first if gender_data.size == 1

    # to_f treats a missing share as 0.0 instead of raising on nil.
    (gender_data["M"].to_f > gender_data["F"].to_f) ? "M" : "F"
  end

  # Result returned by #search for blank input.
  def empty_result
    {first_name: empty_name_metadata, last_name: empty_name_metadata}
  end

  # Metadata placeholder for a name missing from a dataset.
  def empty_name_metadata
    {"country" => {}, "gender" => {}, "rank" => {}}
  end

  private

  # Country name for a code, or nil when IsoCountryCodes does not know it.
  def country_name(alpha2)
    IsoCountryCodes.find(alpha2).name
  rescue IsoCountryCodes::UnknownCodeError
    nil
  end

  # Maps a free-form gender argument onto the dataset's "M"/"F" codes.
  def gender_code(gender)
    gender.to_s.downcase.start_with?("m") ? "M" : "F"
  end
end
# Unit tests for NamesDataset. Each test runs against two small zipped JSON
# fixtures built in #setup, so the full bundled dataset is never loaded.
class TestNamesDataset < Minitest::Test
  def setup
    # Sample data mirroring the dataset layout: name => country/gender/rank.
    @first_names_data = {
      "John" => {
        "country" => {"US" => 90},
        "gender" => {"M" => 1.0},
        "rank" => {"US" => 1}
      },
      "Jane" => {
        "country" => {"US" => 10},
        "gender" => {"F" => 1.0},
        "rank" => {"US" => 2}
      }
    }
    @last_names_data = {
      "Doe" => {
        "country" => {"US" => 100},
        "gender" => {},
        "rank" => {"US" => 1}
      }
    }

    # Fixture archives backed by temp files.
    @first_names_zip = create_mock_zip(@first_names_data)
    @last_names_zip = create_mock_zip(@last_names_data)

    @names_dataset = NamesDataset.new(
      first_names_path: @first_names_zip.path,
      last_names_path: @last_names_zip.path
    )
  end

  def teardown
    # close! closes the descriptor and deletes the file; a bare unlink leaves
    # the handle open until the Tempfile is garbage-collected. Safe navigation
    # covers a setup that failed before both fixtures existed.
    [@first_names_zip, @last_names_zip].each { |zip| zip&.close! }
  end

  def test_initialization
    refute_nil @names_dataset.first_names
    refute_nil @names_dataset.last_names
    assert_equal @first_names_data, @names_dataset.first_names
    assert_equal @last_names_data, @names_dataset.last_names
  end

  def test_search
    result = @names_dataset.search("John")
    assert_equal "United States of America", result[:first_name]["country"].keys.first
    assert_equal "Male", result[:first_name]["gender"].keys.first

    result = @names_dataset.search("Unknown")
    assert_empty result[:first_name]["country"]
    assert_empty result[:last_name]["country"]
  end

  def test_search_last_name
    result = @names_dataset.search("doe")
    assert_equal({"United States of America" => 100}, result[:last_name]["country"])
    assert_equal({"United States of America" => 1}, result[:last_name]["rank"])
    assert_empty result[:last_name]["gender"]
    assert_empty result[:first_name]["country"]
  end

  def test_search_blank_name
    [nil, "", "   "].each do |blank|
      result = @names_dataset.search(blank)
      assert_empty result[:first_name]["country"]
      assert_empty result[:last_name]["country"]
    end
  end

  def test_get_country_codes
    codes = @names_dataset.get_country_codes
    assert_includes codes, "US"

    country_names = @names_dataset.get_country_codes(alpha_2: false)
    assert_includes country_names, "United States of America"
  end

  def test_get_top_names
    top_names = @names_dataset.get_top_names(n: 1, gender: "Male")
    assert_includes top_names["US"]["M"], "John"

    top_names_female = @names_dataset.get_top_names(n: 1, gender: "Female")
    assert_includes top_names_female["US"]["F"], "Jane"
  end

  def test_get_top_names_rejects_non_positive_n
    assert_raises(ArgumentError) { @names_dataset.get_top_names(n: 0) }
  end

  def test_normalize
    assert_equal "John", @names_dataset.normalize(" john ")
    assert_equal "Jane", @names_dataset.normalize("JANE")
  end

  private

  # Writes `data` as JSON into a single-entry zip archive backed by a
  # Tempfile and returns the Tempfile; the caller is responsible for removing
  # it (see #teardown).
  def create_mock_zip(data)
    require "tempfile"
    tempfile = Tempfile.new(["names", ".zip"])

    Zip::File.open(tempfile.path, Zip::File::CREATE) do |zipfile|
      zipfile.get_output_stream("names.json") { |f| f.write(data.to_json) }
    end

    tempfile
  end
end